In [1]:
#Luis Enrique García
#Cod. 200111907
#Login Uniandes: Luis.garcial@uniandes.edu.co

# “Al entregar la solución de este parcial, yo, LUIS ENRIQUE GARCIA LEYVA con código 200111907 me comprometo a 
# no conversar durante el desarrollo de este examen con ninguna persona que no sea el profesor del curso, 
# sobre aspectos relacionados con el parcial; tampoco utilizaré algún medio de comunicación por voz, texto o
# intercambio de archivos, para consultar o compartir con otros, información sobre el tema del parcial. Soy
# consciente y acepto las consecuencias que acarreará para mi desempeño académico cometer fraude en este
# parcial”.
In [2]:
import pandas as pd, sklearn as sl, numpy as np, matplotlib.pyplot as plt, seaborn as sns
import pylev
import re
import warnings
warnings.filterwarnings("ignore")
In [3]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap.umap_ import UMAP
from random import randint
from datetime import datetime

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error
from pandas_profiling import ProfileReport


%matplotlib inline
In [4]:
# Se carga el dataFrame
LosAlpesHistorydf = pd.read_csv(r"C:\Users\LGARCIA\OneDrive - Valorem\LGARCIA\Documents\04. Estudio\05. Maestria\Ciencia Aplicada de Datos\CAD_Parcial1\Data\losalpes_history.csv")

#PUNTO 1: Un análisis exploratorio y de calidad de datos a partir de los datos históricos suministrados. En la medida de lo posible, incluya 
# técnicas de análisis univariado,bivariado, gráficas y no gráficas. Realice la limpieza y preparación correspondiente para que puedan ser consumidos 
# por los algoritmos de machine learning que va a utilizar. Adjunte la evidencia de como quedan los datos preparados, explique y justifique el proceso implementado
In [5]:
# Antes de empezar a entrenar el modelo vamos a revisar una limpieza de datos variable a variable empezando por las variables tipo object: Estandarizando 
# los valores y quitando valores NaN. Posteriormente a esto, revisaremos los valores númericos para quitar valores incoherentes y valores Nan. Por último
# ELiminaremos registros duplicados. 

LosAlpesHistorydf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102083 entries, 0 to 102082
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   102083 non-null  int64  
 1   neighbourhood group  101463 non-null  object 
 2   neighbourhood        101476 non-null  object 
 3   lat                  101484 non-null  float64
 4   long                 101484 non-null  float64
 5   country              100967 non-null  object 
 6   instant_bookable     101387 non-null  object 
 7   cancellation_policy  101416 non-null  object 
 8   room type            101492 non-null  object 
 9   construction year    101279 non-null  float64
 10  price                101245 non-null  object 
 11  service fee          101220 non-null  object 
 12  minimum nights       101089 non-null  float64
 13  availability 365     101049 non-null  float64
 14  number of reviews    101313 non-null  float64
 15  review rate number   101174 non-null  float64
dtypes: float64(7), int64(1), object(8)
memory usage: 12.5+ MB
In [6]:
# Limpieza #1: Variable Neighbourhood group. Empezaremos por quitar los valores NaN. Posteriormente a esto revisaremos los posibles valores y 
# definiremos el conjunto de valores posibles. Posteriormente a esto mediante la función levenshtein llevaremos los valores encontrados hacia 
# el conjunto definido.

LosAlpesHistorydf['neighbourhood group'].unique()
Out[6]:
array(['Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'Staten Island', nan,
       'Quens', 'Manattan', 'Broolkyn', 'Manhatan', 'brookln', 'manhatan'],
      dtype=object)
In [7]:
LosAlpesHistorydf['neighbourhood group']=LosAlpesHistorydf['neighbourhood group'].replace(np.nan, 'No Information')
In [8]:
neighbourhood_group_list = ['Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'State Island','No Information']
In [9]:
LosAlpesHistorydf.loc[~LosAlpesHistorydf["neighbourhood group"].isin(neighbourhood_group_list), "neighbourhood group"].unique()
Out[9]:
array(['Staten Island', 'Quens', 'Manattan', 'Broolkyn', 'Manhatan',
       'brookln', 'manhatan'], dtype=object)
In [10]:
def corregirneighbourhoodGroup1(x):
    distancia = 1
    if pylev.levenshtein('Bronx', x) <= distancia: 
        return 'Bronx'
    elif pylev.levenshtein('Brooklyn', x) <= distancia:
        return 'Brooklyn'
    elif pylev.levenshtein('Manhattan', x) <= distancia:
        return 'Manhattan'
    elif pylev.levenshtein('Queens', x) <= distancia:
        return 'Queens'
    elif pylev.levenshtein('Staten Island', x) <= distancia:
        return 'State Island'
    elif pylev.levenshtein('No Information', x) <= distancia:
        return 'No Information'
    else:
        return x
In [11]:
def corregirneighbourhoodGroup2(x):
    distancia = 2
    if pylev.levenshtein('Bronx', x) <= distancia: 
        return 'Bronx'
    elif pylev.levenshtein('Brooklyn', x) <= distancia:
        return 'Brooklyn'
    elif pylev.levenshtein('Manhattan', x) <= distancia:
        return 'Manhattan'
    elif pylev.levenshtein('Queens', x) <= distancia:
        return 'Queens'
    elif pylev.levenshtein('Staten Island', x) <= distancia:
        return 'State Island'
    elif pylev.levenshtein('No Information', x) <= distancia:
        return 'No Information'
    else:
        return x
In [12]:
LosAlpesHistorydf.loc[~LosAlpesHistorydf["neighbourhood group"].isin(neighbourhood_group_list), "neighbourhood group"].unique()
Out[12]:
array(['Staten Island', 'Quens', 'Manattan', 'Broolkyn', 'Manhatan',
       'brookln', 'manhatan'], dtype=object)
In [13]:
LosAlpesHistorydf['neighbourhood group']=LosAlpesHistorydf['neighbourhood group'].apply(corregirneighbourhoodGroup1)
LosAlpesHistorydf.loc[~LosAlpesHistorydf["neighbourhood group"].isin(neighbourhood_group_list), "neighbourhood group"].unique()
Out[13]:
array(['Broolkyn', 'brookln', 'manhatan'], dtype=object)
In [14]:
LosAlpesHistorydf['neighbourhood group']=LosAlpesHistorydf['neighbourhood group'].apply(corregirneighbourhoodGroup2)
LosAlpesHistorydf.loc[~LosAlpesHistorydf["neighbourhood group"].isin(neighbourhood_group_list), "neighbourhood group"].unique()
Out[14]:
array([], dtype=object)
In [15]:
LosAlpesHistorydf['neighbourhood group'].unique()
Out[15]:
array(['Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'State Island',
       'No Information'], dtype=object)
In [16]:
# Limpieza #2: Vamos a verificar la Variable neighbourhood que esperamos tenga un volumen alto de posibles valores.
In [17]:
LosAlpesHistorydf['neighbourhood'].unique()
Out[17]:
array(["Hell's Kitchen", 'Midtown', 'Upper West Side',
       'Financial District', 'Lower East Side', 'Greenwich Village',
       'Morningside Heights', 'Crown Heights', 'Astoria', 'Clinton Hill',
       'Harlem', 'Bushwick', 'Washington Heights', 'Woodside',
       'Prospect-Lefferts Gardens', 'Prospect Heights', 'East Village',
       'Bath Beach', 'Brooklyn Heights', 'Borough Park',
       'Bedford-Stuyvesant', 'Upper East Side', 'Wakefield',
       'Windsor Terrace', 'Williamsburg', 'Flatbush', 'Kew Gardens',
       'West Village', 'Flatlands', 'Bay Ridge', 'East New York',
       'Kips Bay', 'St. Albans', 'St. George', 'Forest Hills', 'Gramercy',
       'Glendale', 'Dongan Hills', 'Far Rockaway', 'Park Slope',
       'Greenpoint', 'Concourse', 'Little Italy', nan, 'Elmhurst',
       'Chelsea', 'East Harlem', 'Kingsbridge', 'Arverne', 'Sunset Park',
       'South Slope', 'Nolita', 'East Flatbush', 'Murray Hill',
       'Boerum Hill', 'Ridgewood', 'Coney Island', 'Gowanus',
       'Bensonhurst', 'Cambria Heights', 'Cobble Hill',
       'Long Island City', 'Flushing', 'Tribeca', 'Inwood', 'Briarwood',
       'Spuyten Duyvil', 'Concord', 'NoHo', 'Randall Manor', 'Fordham',
       'Springfield Gardens', 'Fort Greene', 'Corona', 'Ditmars Steinway',
       'Cypress Hills', 'Chinatown', 'East Elmhurst', 'Flatiron District',
       'Jamaica', 'Sheepshead Bay', 'Downtown Brooklyn', 'Edenwald',
       'Kensington', 'Oakwood', 'Eltingville', 'Theater District',
       'Gravesend', 'Schuylerville', 'SoHo', 'Middle Village', 'Canarsie',
       'Richmond Hill', 'Sunnyside', 'Jackson Heights', 'Parkchester',
       'Mott Haven', 'Maspeth', 'Stapleton', 'Carroll Gardens',
       'Midland Beach', 'Midwood', 'Van Nest', 'Rego Park',
       'Fort Hamilton', 'University Heights', 'Woodhaven',
       'Brighton Beach', 'Rosebank', 'Williamsbridge', 'Port Morris',
       'Vinegar Hill', 'Whitestone', 'Bayside', 'Longwood',
       'Port Richmond', 'Laurelton', 'Roosevelt Island', 'Tremont',
       'Throgs Neck', 'Unionport', 'Olinville', 'Civic Center',
       'City Island', 'Red Hook', 'Clason Point', 'Battery Park City',
       'Highbridge', 'Queens Village', 'Todt Hill', 'Rockaway Beach',
       'Pelham Gardens', 'Rossville', 'Concourse Village', 'New Brighton',
       'Brownsville', 'Baychester', 'South Ozone Park', 'Bronxdale',
       'Hunts Point', 'Tompkinsville', 'Ozone Park', 'College Point',
       'Mount Eden', 'DUMBO', 'Morris Park', 'Gerritsen Beach',
       'Fresh Meadows', 'Westchester Square', 'Marble Hill',
       'Shore Acres', 'Jamaica Hills', 'Kew Gardens Hills', 'Two Bridges',
       'Claremont Village', 'Bayswater', 'Columbia St', 'Morrisania',
       'Jamaica Estates', 'Belle Harbor', 'Howard Beach',
       'Lighthouse Hill', 'Norwood', 'Mariners Harbor', 'Rosedale',
       'Navy Yard', "Bull's Head", 'Fieldston', 'North Riverdale',
       'Bergen Beach', 'Manhattan Beach', 'Grant City', 'Douglaston',
       'Emerson Hill', 'Bay Terrace, Staten Island', 'Eastchester',
       'Woodlawn', "Prince's Bay", 'Allerton', 'Stuyvesant Town',
       'Belmont', 'Castle Hill', 'Arrochar', 'Edgemere',
       'Castleton Corners', 'Grymes Hill', 'Pelham Bay',
       'East Morrisania', 'Riverdale', 'Bellerose', 'South Beach',
       'Hollis', 'Arden Heights', 'Clifton', 'West Brighton',
       'Mount Hope', 'Howland Hook', 'Little Neck', 'Dyker Heights',
       'Soundview', 'Sea Gate', 'Morris Heights', 'Breezy Point',
       'Neponsit', 'Melrose', 'New Dorp Beach', 'Co-op City', 'Huguenot',
       'Great Kills', 'New Springville', 'Mill Basin', 'West Farms',
       'Chelsea, Staten Island', 'Bay Terrace', 'Tottenville',
       'Holliswood', 'Glen Oaks', 'Richmondtown', 'Silver Lake',
       'Graniteville', 'Willowbrook', 'Westerleigh', 'Woodrow',
       'New Dorp', 'Fort Wadsworth'], dtype=object)
In [18]:
# Al ver la gran variedad de valores diferentes, vamos a hacer una remediación sencilla. Vamos a quitar 
# espacios.
In [19]:
LosAlpesHistorydf['neighbourhood'].str.strip()
Out[19]:
0             Hell's Kitchen
1                    Midtown
2            Upper West Side
3         Financial District
4            Lower East Side
                 ...        
102078      Prospect Heights
102079          Williamsburg
102080              Rosedale
102081       Upper West Side
102082      Long Island City
Name: neighbourhood, Length: 102083, dtype: object
In [20]:
# Limpieza #3: Pasamos a la siguiente variable. El pais. 
LosAlpesHistorydf['country'].unique()
Out[20]:
array(['United States', nan, 'United States of America'], dtype=object)
In [21]:
# Vemos que solo hay una posibilidad "Estados Unidos", así que la vamos asignar a la fuerza. Aunque de inicio vemos que no va a ser una variable
# importante para el modelo de predicción.
# Hacemos un Replace Directo sobre el campo para la limpieza de datos univariable.
In [22]:
LosAlpesHistorydf['country'].replace('United States of America','United States', inplace=True)
LosAlpesHistorydf['country']="United States"
In [23]:
LosAlpesHistorydf['country'].unique()
Out[23]:
array(['United States'], dtype=object)
In [24]:
# Limpieza #4: Verificamos la variable Reserva Instantanea. En esta se esperan dos posibles valores True or False. En caso de valores NaN, estos serán
# reemplazados por False
In [25]:
LosAlpesHistorydf['instant_bookable'].unique()
Out[25]:
array([True, False, nan], dtype=object)
In [26]:
LosAlpesHistorydf['instant_bookable']=LosAlpesHistorydf['instant_bookable'].replace(np.nan, False)
In [27]:
LosAlpesHistorydf['instant_bookable'].unique()
Out[27]:
array([ True, False])
In [28]:
# Limpieza #5: Verificamos la variable Politicas de Cancelación. Esperamos un conjunto de valores reducido. En caso de tener valores Nan 
# se reemplazarán por "Sin información".

LosAlpesHistorydf['cancellation_policy'].unique()
Out[28]:
array(['strict', 'flexible', 'moderate', nan], dtype=object)
In [29]:
LosAlpesHistorydf['cancellation_policy']=LosAlpesHistorydf['cancellation_policy'].replace(np.nan, "No Information")
In [30]:
LosAlpesHistorydf['cancellation_policy'].unique()
Out[30]:
array(['strict', 'flexible', 'moderate', 'No Information'], dtype=object)
In [31]:
# Limpieza #6: Verificamos la variable Tipo de Habitación. Esperamos un conjunto de valores reducido. En caso de tener valores Nan 
# se reemplazarán por "Sin información".
In [32]:
LosAlpesHistorydf['room type'].unique()
Out[32]:
array(['Entire home/apt', 'Hotel room', 'Private room', 'Shared room',
       nan], dtype=object)
In [33]:
LosAlpesHistorydf['room type']=LosAlpesHistorydf['room type'].replace(np.nan, "No Information")
In [34]:
LosAlpesHistorydf['room type'].unique()
Out[34]:
array(['Entire home/apt', 'Hotel room', 'Private room', 'Shared room',
       'No Information'], dtype=object)
In [35]:
# Limpieza #7: Nos causa curiosidad que los campos precio y Service Fee sean strings y no campos numericos, así que solo
# para verificar voy a sacar un unique y ver todos los valores

LosAlpesHistorydf['price'].unique()
Out[35]:
array(['$ 851', '$ 466', '$ 874', '$ 813', '$ 326', '$ 786', '$ 863',
       '$ 749', '$ 397', '$ 462', '$ 121', '$ 1,08', '$ 443', '$ 707',
       '$ 925', '$ 997', '$ 887', '$ 760', '$ 671', '$ 1,15', '$ 277',
       '$ 1,01', '$ 389', '$ 799', '$ 1,06', '$ 828', '$ 833', '$ 306',
       '$ 115', '$ 836', '$ 80', '$ 198', '$ 866', '$ 555', '$ 1,04',
       '$ 1,14', '$ 279', '$ 685', '$ 381', '$ 784', '$ 723', '$ 103',
       '$ 215', '$ 992', '$ 592', '$ 164', '$ 90', '$ 781', '$ 404',
       '$ 72', '$ 175', '$ 494', '$ 960', '$ 272', '$ 232', '$ 1,16',
       '$ 428', '$ 780', '$ 223', '$ 112', '$ 155', '$ 104', '$ 902',
       '$ 865', '$ 119', '$ 501', '$ 607', '$ 1,02', '$ 283', '$ 271',
       '$ 276', '$ 541', nan, '$ 593', '$ 212', '$ 1,19', '$ 838',
       '$ 553', '$ 172', '$ 1,03', '$ 482', '$ 522', '$ 168', '$ 899',
       '$ 1,12', '$ 475', '$ 186', '$ 585', '$ 508', '$ 1,13', '$ 1,11',
       '$ 303', '$ 1,10', '$ 611', '$ 251', '$ 896', '$ 296', '$ 583',
       '$ 1,00', '$ 106', '$ 814', '$ 544', '$ 847', '$ 577', '$ 615',
       '$ 598', '$ 299', '$ 806', '$ 60', '$ 986', '$ 463', '$ 339',
       '$ 1,07', '$ 926', '$ 480', '$ 490', '$ 563', '$ 153', '$ 145',
       '$ 350', '$ 728', '$ 536', '$ 969', '$ 948', '$ 448', '$ 609',
       '$ 367', '$ 379', '$ 618', '$ 697', '$ 291', '$ 107', '$ 777',
       '$ 377', '$ 628', '$ 968', '$ 344', '$ 1,05', '$ 617', '$ 1,18',
       '$ 658', '$ 185', '$ 376', '$ 190', '$ 683', '$ 455', '$ 521',
       '$ 238', '$ 827', '$ 129', '$ 237', '$ 975', '$ 327', '$ 437',
       '$ 317', '$ 681', '$ 353', '$ 205', '$ 558', '$ 978', '$ 808',
       '$ 823', '$ 695', '$ 295', '$ 1,09', '$ 916', '$ 793', '$ 837',
       '$ 216', '$ 288', '$ 402', '$ 909', '$ 136', '$ 955', '$ 354',
       '$ 464', '$ 368', '$ 627', '$ 52', '$ 956', '$ 471', '$ 110',
       '$ 209', '$ 388', '$ 744', '$ 182', '$ 980', '$ 746', '$ 807',
       '$ 659', '$ 571', '$ 517', '$ 535', '$ 323', '$ 735', '$ 967',
       '$ 311', '$ 566', '$ 62', '$ 708', '$ 50', '$ 96', '$ 599',
       '$ 614', '$ 499', '$ 734', '$ 54', '$ 588', '$ 792', '$ 698',
       '$ 549', '$ 419', '$ 788', '$ 890', '$ 369', '$ 79', '$ 416',
       '$ 991', '$ 759', '$ 753', '$ 709', '$ 979', '$ 414', '$ 973',
       '$ 557', '$ 1,17', '$ 370', '$ 947', '$ 415', '$ 399', '$ 989',
       '$ 450', '$ 882', '$ 398', '$ 800', '$ 766', '$ 886', '$ 334',
       '$ 885', '$ 931', '$ 76', '$ 211', '$ 481', '$ 297', '$ 656',
       '$ 717', '$ 946', '$ 258', '$ 412', '$ 400', '$ 266', '$ 171',
       '$ 910', '$ 672', '$ 495', '$ 959', '$ 595', '$ 460', '$ 51',
       '$ 278', '$ 420', '$ 81', '$ 912', '$ 927', '$ 214', '$ 257',
       '$ 140', '$ 253', '$ 538', '$ 952', '$ 151', '$ 755', '$ 733',
       '$ 473', '$ 873', '$ 998', '$ 78', '$ 572', '$ 383', '$ 846',
       '$ 655', '$ 231', '$ 293', '$ 883', '$ 362', '$ 418', '$ 504',
       '$ 653', '$ 895', '$ 879', '$ 228', '$ 976', '$ 265', '$ 310',
       '$ 61', '$ 150', '$ 639', '$ 616', '$ 177', '$ 782', '$ 479',
       '$ 325', '$ 130', '$ 425', '$ 502', '$ 645', '$ 856', '$ 165',
       '$ 196', '$ 394', '$ 661', '$ 634', '$ 358', '$ 562', '$ 580',
       '$ 305', '$ 783', '$ 631', '$ 731', '$ 722', '$ 207', '$ 467',
       '$ 203', '$ 921', '$ 488', '$ 309', '$ 128', '$ 233', '$ 221',
       '$ 514', '$ 605', '$ 849', '$ 762', '$ 942', '$ 691', '$ 355',
       '$ 635', '$ 820', '$ 610', '$ 469', '$ 805', '$ 234', '$ 752',
       '$ 511', '$ 917', '$ 246', '$ 529', '$ 75', '$ 676', '$ 93',
       '$ 982', '$ 92', '$ 996', '$ 977', '$ 141', '$ 565', '$ 456',
       '$ 888', '$ 561', '$ 364', '$ 650', '$ 184', '$ 270', '$ 804',
       '$ 222', '$ 604', '$ 688', '$ 858', '$ 71', '$ 950', '$ 220',
       '$ 798', '$ 747', '$ 934', '$ 626', '$ 133', '$ 539', '$ 91',
       '$ 817', '$ 603', '$ 601', '$ 714', '$ 531', '$ 855', '$ 523',
       '$ 392', '$ 189', '$ 174', '$ 342', '$ 657', '$ 810', '$ 204',
       '$ 314', '$ 142', '$ 158', '$ 684', '$ 386', '$ 920', '$ 526',
       '$ 519', '$ 131', '$ 848', '$ 452', '$ 864', '$ 719', '$ 533',
       '$ 622', '$ 169', '$ 318', '$ 796', '$ 625', '$ 984', '$ 256',
       '$ 772', '$ 1,20', '$ 87', '$ 638', '$ 407', '$ 758', '$ 892',
       '$ 144', '$ 941', '$ 426', '$ 191', '$ 686', '$ 468', '$ 701',
       '$ 951', '$ 505', '$ 797', '$ 496', '$ 943', '$ 834', '$ 700',
       '$ 623', '$ 629', '$ 346', '$ 286', '$ 53', '$ 802', '$ 285',
       '$ 192', '$ 356', '$ 385', '$ 543', '$ 953', '$ 181', '$ 590',
       '$ 59', '$ 646', '$ 918', '$ 195', '$ 880', '$ 905', '$ 867',
       '$ 282', '$ 763', '$ 640', '$ 429', '$ 461', '$ 328', '$ 567',
       '$ 729', '$ 660', '$ 236', '$ 809', '$ 569', '$ 527', '$ 244',
       '$ 861', '$ 243', '$ 933', '$ 673', '$ 872', '$ 713', '$ 881',
       '$ 330', '$ 453', '$ 936', '$ 290', '$ 667', '$ 275', '$ 302',
       '$ 606', '$ 741', '$ 108', '$ 581', '$ 971', '$ 408', '$ 170',
       '$ 520', '$ 652', '$ 132', '$ 636', '$ 67', '$ 449', '$ 219',
       '$ 756', '$ 540', '$ 778', '$ 101', '$ 438', '$ 732', '$ 785',
       '$ 771', '$ 654', '$ 55', '$ 647', '$ 289', '$ 336', '$ 575',
       '$ 69', '$ 210', '$ 341', '$ 199', '$ 126', '$ 662', '$ 351',
       '$ 432', '$ 329', '$ 712', '$ 178', '$ 160', '$ 751', '$ 82',
       '$ 380', '$ 57', '$ 149', '$ 637', '$ 97', '$ 423', '$ 924',
       '$ 619', '$ 248', '$ 994', '$ 436', '$ 987', '$ 937', '$ 179',
       '$ 694', '$ 573', '$ 724', '$ 853', '$ 372', '$ 65', '$ 218',
       '$ 877', '$ 578', '$ 988', '$ 534', '$ 677', '$ 907', '$ 893',
       '$ 903', '$ 716', '$ 484', '$ 898', '$ 842', '$ 343', '$ 312',
       '$ 396', '$ 229', '$ 889', '$ 313', '$ 127', '$ 124', '$ 825',
       '$ 434', '$ 670', '$ 935', '$ 489', '$ 850', '$ 929', '$ 441',
       '$ 474', '$ 668', '$ 644', '$ 818', '$ 194', '$ 801', '$ 391',
       '$ 845', '$ 922', '$ 574', '$ 280', '$ 427', '$ 206', '$ 515',
       '$ 718', '$ 961', '$ 773', '$ 962', '$ 579', '$ 768', '$ 73',
       '$ 498', '$ 679', '$ 525', '$ 345', '$ 387', '$ 146', '$ 649',
       '$ 491', '$ 812', '$ 682', '$ 193', '$ 143', '$ 789', '$ 576',
       '$ 742', '$ 938', '$ 868', '$ 643', '$ 347', '$ 835', '$ 117',
       '$ 840', '$ 457', '$ 442', '$ 458', '$ 767', '$ 908', '$ 255',
       '$ 608', '$ 547', '$ 58', '$ 913', '$ 95', '$ 878', '$ 570',
       '$ 157', '$ 183', '$ 84', '$ 651', '$ 564', '$ 188', '$ 860',
       '$ 162', '$ 267', '$ 803', '$ 963', '$ 56', '$ 770', '$ 584',
       '$ 983', '$ 250', '$ 594', '$ 832', '$ 200', '$ 692', '$ 891',
       '$ 260', '$ 338', '$ 454', '$ 208', '$ 114', '$ 743', '$ 830',
       '$ 111', '$ 981', '$ 166', '$ 308', '$ 965', '$ 357', '$ 568',
       '$ 261', '$ 704', '$ 247', '$ 262', '$ 363', '$ 319', '$ 954',
       '$ 137', '$ 147', '$ 240', '$ 822', '$ 669', '$ 513', '$ 405',
       '$ 268', '$ 897', '$ 148', '$ 109', '$ 699', '$ 582', '$ 559',
       '$ 167', '$ 245', '$ 64', '$ 284', '$ 472', '$ 403', '$ 413',
       '$ 711', '$ 790', '$ 102', '$ 315', '$ 765', '$ 985', '$ 478',
       '$ 957', '$ 552', '$ 715', '$ 703', '$ 648', '$ 689', '$ 242',
       '$ 945', '$ 340', '$ 687', '$ 591', '$ 745', '$ 374', '$ 138',
       '$ 371', '$ 556', '$ 411', '$ 173', '$ 444', '$ 736', '$ 944',
       '$ 831', '$ 613', '$ 993', '$ 239', '$ 844', '$ 674', '$ 542',
       '$ 395', '$ 829', '$ 915', '$ 470', '$ 939', '$ 94', '$ 665',
       '$ 506', '$ 730', '$ 869', '$ 187', '$ 120', '$ 726', '$ 966',
       '$ 445', '$ 83', '$ 70', '$ 393', '$ 487', '$ 301', '$ 862',
       '$ 721', '$ 86', '$ 999', '$ 493', '$ 560', '$ 116', '$ 139',
       '$ 424', '$ 63', '$ 66', '$ 422', '$ 664', '$ 551', '$ 269',
       '$ 923', '$ 451', '$ 225', '$ 919', '$ 273', '$ 620', '$ 349',
       '$ 906', '$ 492', '$ 928', '$ 545', '$ 163', '$ 666', '$ 819',
       '$ 201', '$ 995', '$ 740', '$ 300', '$ 894', '$ 528', '$ 720',
       '$ 821', '$ 794', '$ 680', '$ 352', '$ 550', '$ 274', '$ 264',
       '$ 737', '$ 390', '$ 911', '$ 761', '$ 901', '$ 161', '$ 98',
       '$ 254', '$ 507', '$ 641', '$ 359', '$ 375', '$ 769', '$ 348',
       '$ 972', '$ 675', '$ 693', '$ 597', '$ 159', '$ 932', '$ 324',
       '$ 373', '$ 486', '$ 99', '$ 875', '$ 725', '$ 518', '$ 546',
       '$ 586', '$ 824', '$ 621', '$ 904', '$ 417', '$ 89', '$ 554',
       '$ 68', '$ 633', '$ 612', '$ 485', '$ 779', '$ 500', '$ 406',
       '$ 642', '$ 263', '$ 587', '$ 105', '$ 100', '$ 217', '$ 705',
       '$ 213', '$ 900', '$ 859', '$ 281', '$ 964', '$ 854', '$ 476',
       '$ 990', '$ 366', '$ 930', '$ 970', '$ 630', '$ 335', '$ 409',
       '$ 811', '$ 795', '$ 235', '$ 483', '$ 77', '$ 88', '$ 259',
       '$ 857', '$ 202', '$ 361', '$ 710', '$ 663', '$ 134', '$ 884',
       '$ 602', '$ 632', '$ 678', '$ 512', '$ 421', '$ 156', '$ 249',
       '$ 439', '$ 298', '$ 477', '$ 447', '$ 510', '$ 440', '$ 382',
       '$ 321', '$ 176', '$ 85', '$ 596', '$ 152', '$ 316', '$ 876',
       '$ 839', '$ 118', '$ 757', '$ 600', '$ 624', '$ 702', '$ 852',
       '$ 509', '$ 532', '$ 826', '$ 135', '$ 180', '$ 816', '$ 292',
       '$ 365', '$ 696', '$ 738', '$ 843', '$ 871', '$ 435', '$ 524',
       '$ 307', '$ 727', '$ 224', '$ 974', '$ 774', '$ 940', '$ 332',
       '$ 764', '$ 252', '$ 333', '$ 791', '$ 401', '$ 154', '$ 503',
       '$ 113', '$ 958', '$ 230', '$ 410', '$ 431', '$ 537', '$ 331',
       '$ 706', '$ 459', '$ 750', '$ 775', '$ 516', '$ 754', '$ 548',
       '$ 304', '$ 446', '$ 815', '$ 430', '$ 322', '$ 497', '$ 337',
       '$ 841', '$ 690', '$ 227', '$ 226', '$ 530', '$ 433', '$ 74',
       '$ 360', '$ 197', '$ 384', '$ 739', '$ 949', '$ 122', '$ 787',
       '$ 914', '$ 378', '$ 870', '$ 320', '$ 123', '$ 776', '$ 241',
       '$ 125', '$ 465', '$ 287', '$ 589', '$ 294', '$ 748', '$ -611',
       '$ 254000'], dtype=object)
In [36]:
# Vemos que el problema esta con los signos pesos, algunos negativos y comas en lugar de puntos.  Empezaremos quitando el signo 
# y cambiando las compas por puntos. Tambien se cambiaran los nan por ceros.

LosAlpesHistorydf['price']=LosAlpesHistorydf['price'].str.slice_replace(stop=2, repl='')
LosAlpesHistorydf['price']=LosAlpesHistorydf['price'].replace(np.nan, '0.0')
LosAlpesHistorydf['price']=LosAlpesHistorydf['price'].replace(',', '.',regex=True)
LosAlpesHistorydf['price']=LosAlpesHistorydf['price'].astype(float, errors = 'raise')
LosAlpesHistorydf['price'].unique()
Out[36]:
array([ 8.51e+02,  4.66e+02,  8.74e+02,  8.13e+02,  3.26e+02,  7.86e+02,
        8.63e+02,  7.49e+02,  3.97e+02,  4.62e+02,  1.21e+02,  1.08e+00,
        4.43e+02,  7.07e+02,  9.25e+02,  9.97e+02,  8.87e+02,  7.60e+02,
        6.71e+02,  1.15e+00,  2.77e+02,  1.01e+00,  3.89e+02,  7.99e+02,
        1.06e+00,  8.28e+02,  8.33e+02,  3.06e+02,  1.15e+02,  8.36e+02,
        8.00e+01,  1.98e+02,  8.66e+02,  5.55e+02,  1.04e+00,  1.14e+00,
        2.79e+02,  6.85e+02,  3.81e+02,  7.84e+02,  7.23e+02,  1.03e+02,
        2.15e+02,  9.92e+02,  5.92e+02,  1.64e+02,  9.00e+01,  7.81e+02,
        4.04e+02,  7.20e+01,  1.75e+02,  4.94e+02,  9.60e+02,  2.72e+02,
        2.32e+02,  1.16e+00,  4.28e+02,  7.80e+02,  2.23e+02,  1.12e+02,
        1.55e+02,  1.04e+02,  9.02e+02,  8.65e+02,  1.19e+02,  5.01e+02,
        6.07e+02,  1.02e+00,  2.83e+02,  2.71e+02,  2.76e+02,  5.41e+02,
        0.00e+00,  5.93e+02,  2.12e+02,  1.19e+00,  8.38e+02,  5.53e+02,
        1.72e+02,  1.03e+00,  4.82e+02,  5.22e+02,  1.68e+02,  8.99e+02,
        1.12e+00,  4.75e+02,  1.86e+02,  5.85e+02,  5.08e+02,  1.13e+00,
        1.11e+00,  3.03e+02,  1.10e+00,  6.11e+02,  2.51e+02,  8.96e+02,
        2.96e+02,  5.83e+02,  1.00e+00,  1.06e+02,  8.14e+02,  5.44e+02,
        8.47e+02,  5.77e+02,  6.15e+02,  5.98e+02,  2.99e+02,  8.06e+02,
        6.00e+01,  9.86e+02,  4.63e+02,  3.39e+02,  1.07e+00,  9.26e+02,
        4.80e+02,  4.90e+02,  5.63e+02,  1.53e+02,  1.45e+02,  3.50e+02,
        7.28e+02,  5.36e+02,  9.69e+02,  9.48e+02,  4.48e+02,  6.09e+02,
        3.67e+02,  3.79e+02,  6.18e+02,  6.97e+02,  2.91e+02,  1.07e+02,
        7.77e+02,  3.77e+02,  6.28e+02,  9.68e+02,  3.44e+02,  1.05e+00,
        6.17e+02,  1.18e+00,  6.58e+02,  1.85e+02,  3.76e+02,  1.90e+02,
        6.83e+02,  4.55e+02,  5.21e+02,  2.38e+02,  8.27e+02,  1.29e+02,
        2.37e+02,  9.75e+02,  3.27e+02,  4.37e+02,  3.17e+02,  6.81e+02,
        3.53e+02,  2.05e+02,  5.58e+02,  9.78e+02,  8.08e+02,  8.23e+02,
        6.95e+02,  2.95e+02,  1.09e+00,  9.16e+02,  7.93e+02,  8.37e+02,
        2.16e+02,  2.88e+02,  4.02e+02,  9.09e+02,  1.36e+02,  9.55e+02,
        3.54e+02,  4.64e+02,  3.68e+02,  6.27e+02,  5.20e+01,  9.56e+02,
        4.71e+02,  1.10e+02,  2.09e+02,  3.88e+02,  7.44e+02,  1.82e+02,
        9.80e+02,  7.46e+02,  8.07e+02,  6.59e+02,  5.71e+02,  5.17e+02,
        5.35e+02,  3.23e+02,  7.35e+02,  9.67e+02,  3.11e+02,  5.66e+02,
        6.20e+01,  7.08e+02,  5.00e+01,  9.60e+01,  5.99e+02,  6.14e+02,
        4.99e+02,  7.34e+02,  5.40e+01,  5.88e+02,  7.92e+02,  6.98e+02,
        5.49e+02,  4.19e+02,  7.88e+02,  8.90e+02,  3.69e+02,  7.90e+01,
        4.16e+02,  9.91e+02,  7.59e+02,  7.53e+02,  7.09e+02,  9.79e+02,
        4.14e+02,  9.73e+02,  5.57e+02,  1.17e+00,  3.70e+02,  9.47e+02,
        4.15e+02,  3.99e+02,  9.89e+02,  4.50e+02,  8.82e+02,  3.98e+02,
        8.00e+02,  7.66e+02,  8.86e+02,  3.34e+02,  8.85e+02,  9.31e+02,
        7.60e+01,  2.11e+02,  4.81e+02,  2.97e+02,  6.56e+02,  7.17e+02,
        9.46e+02,  2.58e+02,  4.12e+02,  4.00e+02,  2.66e+02,  1.71e+02,
        9.10e+02,  6.72e+02,  4.95e+02,  9.59e+02,  5.95e+02,  4.60e+02,
        5.10e+01,  2.78e+02,  4.20e+02,  8.10e+01,  9.12e+02,  9.27e+02,
        2.14e+02,  2.57e+02,  1.40e+02,  2.53e+02,  5.38e+02,  9.52e+02,
        1.51e+02,  7.55e+02,  7.33e+02,  4.73e+02,  8.73e+02,  9.98e+02,
        7.80e+01,  5.72e+02,  3.83e+02,  8.46e+02,  6.55e+02,  2.31e+02,
        2.93e+02,  8.83e+02,  3.62e+02,  4.18e+02,  5.04e+02,  6.53e+02,
        8.95e+02,  8.79e+02,  2.28e+02,  9.76e+02,  2.65e+02,  3.10e+02,
        6.10e+01,  1.50e+02,  6.39e+02,  6.16e+02,  1.77e+02,  7.82e+02,
        4.79e+02,  3.25e+02,  1.30e+02,  4.25e+02,  5.02e+02,  6.45e+02,
        8.56e+02,  1.65e+02,  1.96e+02,  3.94e+02,  6.61e+02,  6.34e+02,
        3.58e+02,  5.62e+02,  5.80e+02,  3.05e+02,  7.83e+02,  6.31e+02,
        7.31e+02,  7.22e+02,  2.07e+02,  4.67e+02,  2.03e+02,  9.21e+02,
        4.88e+02,  3.09e+02,  1.28e+02,  2.33e+02,  2.21e+02,  5.14e+02,
        6.05e+02,  8.49e+02,  7.62e+02,  9.42e+02,  6.91e+02,  3.55e+02,
        6.35e+02,  8.20e+02,  6.10e+02,  4.69e+02,  8.05e+02,  2.34e+02,
        7.52e+02,  5.11e+02,  9.17e+02,  2.46e+02,  5.29e+02,  7.50e+01,
        6.76e+02,  9.30e+01,  9.82e+02,  9.20e+01,  9.96e+02,  9.77e+02,
        1.41e+02,  5.65e+02,  4.56e+02,  8.88e+02,  5.61e+02,  3.64e+02,
        6.50e+02,  1.84e+02,  2.70e+02,  8.04e+02,  2.22e+02,  6.04e+02,
        6.88e+02,  8.58e+02,  7.10e+01,  9.50e+02,  2.20e+02,  7.98e+02,
        7.47e+02,  9.34e+02,  6.26e+02,  1.33e+02,  5.39e+02,  9.10e+01,
        8.17e+02,  6.03e+02,  6.01e+02,  7.14e+02,  5.31e+02,  8.55e+02,
        5.23e+02,  3.92e+02,  1.89e+02,  1.74e+02,  3.42e+02,  6.57e+02,
        8.10e+02,  2.04e+02,  3.14e+02,  1.42e+02,  1.58e+02,  6.84e+02,
        3.86e+02,  9.20e+02,  5.26e+02,  5.19e+02,  1.31e+02,  8.48e+02,
        4.52e+02,  8.64e+02,  7.19e+02,  5.33e+02,  6.22e+02,  1.69e+02,
        3.18e+02,  7.96e+02,  6.25e+02,  9.84e+02,  2.56e+02,  7.72e+02,
        1.20e+00,  8.70e+01,  6.38e+02,  4.07e+02,  7.58e+02,  8.92e+02,
        1.44e+02,  9.41e+02,  4.26e+02,  1.91e+02,  6.86e+02,  4.68e+02,
        7.01e+02,  9.51e+02,  5.05e+02,  7.97e+02,  4.96e+02,  9.43e+02,
        8.34e+02,  7.00e+02,  6.23e+02,  6.29e+02,  3.46e+02,  2.86e+02,
        5.30e+01,  8.02e+02,  2.85e+02,  1.92e+02,  3.56e+02,  3.85e+02,
        5.43e+02,  9.53e+02,  1.81e+02,  5.90e+02,  5.90e+01,  6.46e+02,
        9.18e+02,  1.95e+02,  8.80e+02,  9.05e+02,  8.67e+02,  2.82e+02,
        7.63e+02,  6.40e+02,  4.29e+02,  4.61e+02,  3.28e+02,  5.67e+02,
        7.29e+02,  6.60e+02,  2.36e+02,  8.09e+02,  5.69e+02,  5.27e+02,
        2.44e+02,  8.61e+02,  2.43e+02,  9.33e+02,  6.73e+02,  8.72e+02,
        7.13e+02,  8.81e+02,  3.30e+02,  4.53e+02,  9.36e+02,  2.90e+02,
        6.67e+02,  2.75e+02,  3.02e+02,  6.06e+02,  7.41e+02,  1.08e+02,
        5.81e+02,  9.71e+02,  4.08e+02,  1.70e+02,  5.20e+02,  6.52e+02,
        1.32e+02,  6.36e+02,  6.70e+01,  4.49e+02,  2.19e+02,  7.56e+02,
        5.40e+02,  7.78e+02,  1.01e+02,  4.38e+02,  7.32e+02,  7.85e+02,
        7.71e+02,  6.54e+02,  5.50e+01,  6.47e+02,  2.89e+02,  3.36e+02,
        5.75e+02,  6.90e+01,  2.10e+02,  3.41e+02,  1.99e+02,  1.26e+02,
        6.62e+02,  3.51e+02,  4.32e+02,  3.29e+02,  7.12e+02,  1.78e+02,
        1.60e+02,  7.51e+02,  8.20e+01,  3.80e+02,  5.70e+01,  1.49e+02,
        6.37e+02,  9.70e+01,  4.23e+02,  9.24e+02,  6.19e+02,  2.48e+02,
        9.94e+02,  4.36e+02,  9.87e+02,  9.37e+02,  1.79e+02,  6.94e+02,
        5.73e+02,  7.24e+02,  8.53e+02,  3.72e+02,  6.50e+01,  2.18e+02,
        8.77e+02,  5.78e+02,  9.88e+02,  5.34e+02,  6.77e+02,  9.07e+02,
        8.93e+02,  9.03e+02,  7.16e+02,  4.84e+02,  8.98e+02,  8.42e+02,
        3.43e+02,  3.12e+02,  3.96e+02,  2.29e+02,  8.89e+02,  3.13e+02,
        1.27e+02,  1.24e+02,  8.25e+02,  4.34e+02,  6.70e+02,  9.35e+02,
        4.89e+02,  8.50e+02,  9.29e+02,  4.41e+02,  4.74e+02,  6.68e+02,
        6.44e+02,  8.18e+02,  1.94e+02,  8.01e+02,  3.91e+02,  8.45e+02,
        9.22e+02,  5.74e+02,  2.80e+02,  4.27e+02,  2.06e+02,  5.15e+02,
        7.18e+02,  9.61e+02,  7.73e+02,  9.62e+02,  5.79e+02,  7.68e+02,
        7.30e+01,  4.98e+02,  6.79e+02,  5.25e+02,  3.45e+02,  3.87e+02,
        1.46e+02,  6.49e+02,  4.91e+02,  8.12e+02,  6.82e+02,  1.93e+02,
        1.43e+02,  7.89e+02,  5.76e+02,  7.42e+02,  9.38e+02,  8.68e+02,
        6.43e+02,  3.47e+02,  8.35e+02,  1.17e+02,  8.40e+02,  4.57e+02,
        4.42e+02,  4.58e+02,  7.67e+02,  9.08e+02,  2.55e+02,  6.08e+02,
        5.47e+02,  5.80e+01,  9.13e+02,  9.50e+01,  8.78e+02,  5.70e+02,
        1.57e+02,  1.83e+02,  8.40e+01,  6.51e+02,  5.64e+02,  1.88e+02,
        8.60e+02,  1.62e+02,  2.67e+02,  8.03e+02,  9.63e+02,  5.60e+01,
        7.70e+02,  5.84e+02,  9.83e+02,  2.50e+02,  5.94e+02,  8.32e+02,
        2.00e+02,  6.92e+02,  8.91e+02,  2.60e+02,  3.38e+02,  4.54e+02,
        2.08e+02,  1.14e+02,  7.43e+02,  8.30e+02,  1.11e+02,  9.81e+02,
        1.66e+02,  3.08e+02,  9.65e+02,  3.57e+02,  5.68e+02,  2.61e+02,
        7.04e+02,  2.47e+02,  2.62e+02,  3.63e+02,  3.19e+02,  9.54e+02,
        1.37e+02,  1.47e+02,  2.40e+02,  8.22e+02,  6.69e+02,  5.13e+02,
        4.05e+02,  2.68e+02,  8.97e+02,  1.48e+02,  1.09e+02,  6.99e+02,
        5.82e+02,  5.59e+02,  1.67e+02,  2.45e+02,  6.40e+01,  2.84e+02,
        4.72e+02,  4.03e+02,  4.13e+02,  7.11e+02,  7.90e+02,  1.02e+02,
        3.15e+02,  7.65e+02,  9.85e+02,  4.78e+02,  9.57e+02,  5.52e+02,
        7.15e+02,  7.03e+02,  6.48e+02,  6.89e+02,  2.42e+02,  9.45e+02,
        3.40e+02,  6.87e+02,  5.91e+02,  7.45e+02,  3.74e+02,  1.38e+02,
        3.71e+02,  5.56e+02,  4.11e+02,  1.73e+02,  4.44e+02,  7.36e+02,
        9.44e+02,  8.31e+02,  6.13e+02,  9.93e+02,  2.39e+02,  8.44e+02,
        6.74e+02,  5.42e+02,  3.95e+02,  8.29e+02,  9.15e+02,  4.70e+02,
        9.39e+02,  9.40e+01,  6.65e+02,  5.06e+02,  7.30e+02,  8.69e+02,
        1.87e+02,  1.20e+02,  7.26e+02,  9.66e+02,  4.45e+02,  8.30e+01,
        7.00e+01,  3.93e+02,  4.87e+02,  3.01e+02,  8.62e+02,  7.21e+02,
        8.60e+01,  9.99e+02,  4.93e+02,  5.60e+02,  1.16e+02,  1.39e+02,
        4.24e+02,  6.30e+01,  6.60e+01,  4.22e+02,  6.64e+02,  5.51e+02,
        2.69e+02,  9.23e+02,  4.51e+02,  2.25e+02,  9.19e+02,  2.73e+02,
        6.20e+02,  3.49e+02,  9.06e+02,  4.92e+02,  9.28e+02,  5.45e+02,
        1.63e+02,  6.66e+02,  8.19e+02,  2.01e+02,  9.95e+02,  7.40e+02,
        3.00e+02,  8.94e+02,  5.28e+02,  7.20e+02,  8.21e+02,  7.94e+02,
        6.80e+02,  3.52e+02,  5.50e+02,  2.74e+02,  2.64e+02,  7.37e+02,
        3.90e+02,  9.11e+02,  7.61e+02,  9.01e+02,  1.61e+02,  9.80e+01,
        2.54e+02,  5.07e+02,  6.41e+02,  3.59e+02,  3.75e+02,  7.69e+02,
        3.48e+02,  9.72e+02,  6.75e+02,  6.93e+02,  5.97e+02,  1.59e+02,
        9.32e+02,  3.24e+02,  3.73e+02,  4.86e+02,  9.90e+01,  8.75e+02,
        7.25e+02,  5.18e+02,  5.46e+02,  5.86e+02,  8.24e+02,  6.21e+02,
        9.04e+02,  4.17e+02,  8.90e+01,  5.54e+02,  6.80e+01,  6.33e+02,
        6.12e+02,  4.85e+02,  7.79e+02,  5.00e+02,  4.06e+02,  6.42e+02,
        2.63e+02,  5.87e+02,  1.05e+02,  1.00e+02,  2.17e+02,  7.05e+02,
        2.13e+02,  9.00e+02,  8.59e+02,  2.81e+02,  9.64e+02,  8.54e+02,
        4.76e+02,  9.90e+02,  3.66e+02,  9.30e+02,  9.70e+02,  6.30e+02,
        3.35e+02,  4.09e+02,  8.11e+02,  7.95e+02,  2.35e+02,  4.83e+02,
        7.70e+01,  8.80e+01,  2.59e+02,  8.57e+02,  2.02e+02,  3.61e+02,
        7.10e+02,  6.63e+02,  1.34e+02,  8.84e+02,  6.02e+02,  6.32e+02,
        6.78e+02,  5.12e+02,  4.21e+02,  1.56e+02,  2.49e+02,  4.39e+02,
        2.98e+02,  4.77e+02,  4.47e+02,  5.10e+02,  4.40e+02,  3.82e+02,
        3.21e+02,  1.76e+02,  8.50e+01,  5.96e+02,  1.52e+02,  3.16e+02,
        8.76e+02,  8.39e+02,  1.18e+02,  7.57e+02,  6.00e+02,  6.24e+02,
        7.02e+02,  8.52e+02,  5.09e+02,  5.32e+02,  8.26e+02,  1.35e+02,
        1.80e+02,  8.16e+02,  2.92e+02,  3.65e+02,  6.96e+02,  7.38e+02,
        8.43e+02,  8.71e+02,  4.35e+02,  5.24e+02,  3.07e+02,  7.27e+02,
        2.24e+02,  9.74e+02,  7.74e+02,  9.40e+02,  3.32e+02,  7.64e+02,
        2.52e+02,  3.33e+02,  7.91e+02,  4.01e+02,  1.54e+02,  5.03e+02,
        1.13e+02,  9.58e+02,  2.30e+02,  4.10e+02,  4.31e+02,  5.37e+02,
        3.31e+02,  7.06e+02,  4.59e+02,  7.50e+02,  7.75e+02,  5.16e+02,
        7.54e+02,  5.48e+02,  3.04e+02,  4.46e+02,  8.15e+02,  4.30e+02,
        3.22e+02,  4.97e+02,  3.37e+02,  8.41e+02,  6.90e+02,  2.27e+02,
        2.26e+02,  5.30e+02,  4.33e+02,  7.40e+01,  3.60e+02,  1.97e+02,
        3.84e+02,  7.39e+02,  9.49e+02,  1.22e+02,  7.87e+02,  9.14e+02,
        3.78e+02,  8.70e+02,  3.20e+02,  1.23e+02,  7.76e+02,  2.41e+02,
        1.25e+02,  4.65e+02,  2.87e+02,  5.89e+02,  2.94e+02,  7.48e+02,
       -6.11e+02,  2.54e+05])
In [37]:
def corregirNegativo(x):
    if x > 0: 
        return x
    elif x < 0:
        return -1*x
    else: 
        return 0
In [38]:
# Corregimos los negativos suponiendo que son problemas de digitación. Esto mismo lo haremos en las siguientes variables.
LosAlpesHistorydf['price']=LosAlpesHistorydf['price'].apply(corregirNegativo)
In [39]:
LosAlpesHistorydf['price'].unique()
Out[39]:
array([8.51e+02, 4.66e+02, 8.74e+02, 8.13e+02, 3.26e+02, 7.86e+02,
       8.63e+02, 7.49e+02, 3.97e+02, 4.62e+02, 1.21e+02, 1.08e+00,
       4.43e+02, 7.07e+02, 9.25e+02, 9.97e+02, 8.87e+02, 7.60e+02,
       6.71e+02, 1.15e+00, 2.77e+02, 1.01e+00, 3.89e+02, 7.99e+02,
       1.06e+00, 8.28e+02, 8.33e+02, 3.06e+02, 1.15e+02, 8.36e+02,
       8.00e+01, 1.98e+02, 8.66e+02, 5.55e+02, 1.04e+00, 1.14e+00,
       2.79e+02, 6.85e+02, 3.81e+02, 7.84e+02, 7.23e+02, 1.03e+02,
       2.15e+02, 9.92e+02, 5.92e+02, 1.64e+02, 9.00e+01, 7.81e+02,
       4.04e+02, 7.20e+01, 1.75e+02, 4.94e+02, 9.60e+02, 2.72e+02,
       2.32e+02, 1.16e+00, 4.28e+02, 7.80e+02, 2.23e+02, 1.12e+02,
       1.55e+02, 1.04e+02, 9.02e+02, 8.65e+02, 1.19e+02, 5.01e+02,
       6.07e+02, 1.02e+00, 2.83e+02, 2.71e+02, 2.76e+02, 5.41e+02,
       0.00e+00, 5.93e+02, 2.12e+02, 1.19e+00, 8.38e+02, 5.53e+02,
       1.72e+02, 1.03e+00, 4.82e+02, 5.22e+02, 1.68e+02, 8.99e+02,
       1.12e+00, 4.75e+02, 1.86e+02, 5.85e+02, 5.08e+02, 1.13e+00,
       1.11e+00, 3.03e+02, 1.10e+00, 6.11e+02, 2.51e+02, 8.96e+02,
       2.96e+02, 5.83e+02, 1.00e+00, 1.06e+02, 8.14e+02, 5.44e+02,
       8.47e+02, 5.77e+02, 6.15e+02, 5.98e+02, 2.99e+02, 8.06e+02,
       6.00e+01, 9.86e+02, 4.63e+02, 3.39e+02, 1.07e+00, 9.26e+02,
       4.80e+02, 4.90e+02, 5.63e+02, 1.53e+02, 1.45e+02, 3.50e+02,
       7.28e+02, 5.36e+02, 9.69e+02, 9.48e+02, 4.48e+02, 6.09e+02,
       3.67e+02, 3.79e+02, 6.18e+02, 6.97e+02, 2.91e+02, 1.07e+02,
       7.77e+02, 3.77e+02, 6.28e+02, 9.68e+02, 3.44e+02, 1.05e+00,
       6.17e+02, 1.18e+00, 6.58e+02, 1.85e+02, 3.76e+02, 1.90e+02,
       6.83e+02, 4.55e+02, 5.21e+02, 2.38e+02, 8.27e+02, 1.29e+02,
       2.37e+02, 9.75e+02, 3.27e+02, 4.37e+02, 3.17e+02, 6.81e+02,
       3.53e+02, 2.05e+02, 5.58e+02, 9.78e+02, 8.08e+02, 8.23e+02,
       6.95e+02, 2.95e+02, 1.09e+00, 9.16e+02, 7.93e+02, 8.37e+02,
       2.16e+02, 2.88e+02, 4.02e+02, 9.09e+02, 1.36e+02, 9.55e+02,
       3.54e+02, 4.64e+02, 3.68e+02, 6.27e+02, 5.20e+01, 9.56e+02,
       4.71e+02, 1.10e+02, 2.09e+02, 3.88e+02, 7.44e+02, 1.82e+02,
       9.80e+02, 7.46e+02, 8.07e+02, 6.59e+02, 5.71e+02, 5.17e+02,
       5.35e+02, 3.23e+02, 7.35e+02, 9.67e+02, 3.11e+02, 5.66e+02,
       6.20e+01, 7.08e+02, 5.00e+01, 9.60e+01, 5.99e+02, 6.14e+02,
       4.99e+02, 7.34e+02, 5.40e+01, 5.88e+02, 7.92e+02, 6.98e+02,
       5.49e+02, 4.19e+02, 7.88e+02, 8.90e+02, 3.69e+02, 7.90e+01,
       4.16e+02, 9.91e+02, 7.59e+02, 7.53e+02, 7.09e+02, 9.79e+02,
       4.14e+02, 9.73e+02, 5.57e+02, 1.17e+00, 3.70e+02, 9.47e+02,
       4.15e+02, 3.99e+02, 9.89e+02, 4.50e+02, 8.82e+02, 3.98e+02,
       8.00e+02, 7.66e+02, 8.86e+02, 3.34e+02, 8.85e+02, 9.31e+02,
       7.60e+01, 2.11e+02, 4.81e+02, 2.97e+02, 6.56e+02, 7.17e+02,
       9.46e+02, 2.58e+02, 4.12e+02, 4.00e+02, 2.66e+02, 1.71e+02,
       9.10e+02, 6.72e+02, 4.95e+02, 9.59e+02, 5.95e+02, 4.60e+02,
       5.10e+01, 2.78e+02, 4.20e+02, 8.10e+01, 9.12e+02, 9.27e+02,
       2.14e+02, 2.57e+02, 1.40e+02, 2.53e+02, 5.38e+02, 9.52e+02,
       1.51e+02, 7.55e+02, 7.33e+02, 4.73e+02, 8.73e+02, 9.98e+02,
       7.80e+01, 5.72e+02, 3.83e+02, 8.46e+02, 6.55e+02, 2.31e+02,
       2.93e+02, 8.83e+02, 3.62e+02, 4.18e+02, 5.04e+02, 6.53e+02,
       8.95e+02, 8.79e+02, 2.28e+02, 9.76e+02, 2.65e+02, 3.10e+02,
       6.10e+01, 1.50e+02, 6.39e+02, 6.16e+02, 1.77e+02, 7.82e+02,
       4.79e+02, 3.25e+02, 1.30e+02, 4.25e+02, 5.02e+02, 6.45e+02,
       8.56e+02, 1.65e+02, 1.96e+02, 3.94e+02, 6.61e+02, 6.34e+02,
       3.58e+02, 5.62e+02, 5.80e+02, 3.05e+02, 7.83e+02, 6.31e+02,
       7.31e+02, 7.22e+02, 2.07e+02, 4.67e+02, 2.03e+02, 9.21e+02,
       4.88e+02, 3.09e+02, 1.28e+02, 2.33e+02, 2.21e+02, 5.14e+02,
       6.05e+02, 8.49e+02, 7.62e+02, 9.42e+02, 6.91e+02, 3.55e+02,
       6.35e+02, 8.20e+02, 6.10e+02, 4.69e+02, 8.05e+02, 2.34e+02,
       7.52e+02, 5.11e+02, 9.17e+02, 2.46e+02, 5.29e+02, 7.50e+01,
       6.76e+02, 9.30e+01, 9.82e+02, 9.20e+01, 9.96e+02, 9.77e+02,
       1.41e+02, 5.65e+02, 4.56e+02, 8.88e+02, 5.61e+02, 3.64e+02,
       6.50e+02, 1.84e+02, 2.70e+02, 8.04e+02, 2.22e+02, 6.04e+02,
       6.88e+02, 8.58e+02, 7.10e+01, 9.50e+02, 2.20e+02, 7.98e+02,
       7.47e+02, 9.34e+02, 6.26e+02, 1.33e+02, 5.39e+02, 9.10e+01,
       8.17e+02, 6.03e+02, 6.01e+02, 7.14e+02, 5.31e+02, 8.55e+02,
       5.23e+02, 3.92e+02, 1.89e+02, 1.74e+02, 3.42e+02, 6.57e+02,
       8.10e+02, 2.04e+02, 3.14e+02, 1.42e+02, 1.58e+02, 6.84e+02,
       3.86e+02, 9.20e+02, 5.26e+02, 5.19e+02, 1.31e+02, 8.48e+02,
       4.52e+02, 8.64e+02, 7.19e+02, 5.33e+02, 6.22e+02, 1.69e+02,
       3.18e+02, 7.96e+02, 6.25e+02, 9.84e+02, 2.56e+02, 7.72e+02,
       1.20e+00, 8.70e+01, 6.38e+02, 4.07e+02, 7.58e+02, 8.92e+02,
       1.44e+02, 9.41e+02, 4.26e+02, 1.91e+02, 6.86e+02, 4.68e+02,
       7.01e+02, 9.51e+02, 5.05e+02, 7.97e+02, 4.96e+02, 9.43e+02,
       8.34e+02, 7.00e+02, 6.23e+02, 6.29e+02, 3.46e+02, 2.86e+02,
       5.30e+01, 8.02e+02, 2.85e+02, 1.92e+02, 3.56e+02, 3.85e+02,
       5.43e+02, 9.53e+02, 1.81e+02, 5.90e+02, 5.90e+01, 6.46e+02,
       9.18e+02, 1.95e+02, 8.80e+02, 9.05e+02, 8.67e+02, 2.82e+02,
       7.63e+02, 6.40e+02, 4.29e+02, 4.61e+02, 3.28e+02, 5.67e+02,
       7.29e+02, 6.60e+02, 2.36e+02, 8.09e+02, 5.69e+02, 5.27e+02,
       2.44e+02, 8.61e+02, 2.43e+02, 9.33e+02, 6.73e+02, 8.72e+02,
       7.13e+02, 8.81e+02, 3.30e+02, 4.53e+02, 9.36e+02, 2.90e+02,
       6.67e+02, 2.75e+02, 3.02e+02, 6.06e+02, 7.41e+02, 1.08e+02,
       5.81e+02, 9.71e+02, 4.08e+02, 1.70e+02, 5.20e+02, 6.52e+02,
       1.32e+02, 6.36e+02, 6.70e+01, 4.49e+02, 2.19e+02, 7.56e+02,
       5.40e+02, 7.78e+02, 1.01e+02, 4.38e+02, 7.32e+02, 7.85e+02,
       7.71e+02, 6.54e+02, 5.50e+01, 6.47e+02, 2.89e+02, 3.36e+02,
       5.75e+02, 6.90e+01, 2.10e+02, 3.41e+02, 1.99e+02, 1.26e+02,
       6.62e+02, 3.51e+02, 4.32e+02, 3.29e+02, 7.12e+02, 1.78e+02,
       1.60e+02, 7.51e+02, 8.20e+01, 3.80e+02, 5.70e+01, 1.49e+02,
       6.37e+02, 9.70e+01, 4.23e+02, 9.24e+02, 6.19e+02, 2.48e+02,
       9.94e+02, 4.36e+02, 9.87e+02, 9.37e+02, 1.79e+02, 6.94e+02,
       5.73e+02, 7.24e+02, 8.53e+02, 3.72e+02, 6.50e+01, 2.18e+02,
       8.77e+02, 5.78e+02, 9.88e+02, 5.34e+02, 6.77e+02, 9.07e+02,
       8.93e+02, 9.03e+02, 7.16e+02, 4.84e+02, 8.98e+02, 8.42e+02,
       3.43e+02, 3.12e+02, 3.96e+02, 2.29e+02, 8.89e+02, 3.13e+02,
       1.27e+02, 1.24e+02, 8.25e+02, 4.34e+02, 6.70e+02, 9.35e+02,
       4.89e+02, 8.50e+02, 9.29e+02, 4.41e+02, 4.74e+02, 6.68e+02,
       6.44e+02, 8.18e+02, 1.94e+02, 8.01e+02, 3.91e+02, 8.45e+02,
       9.22e+02, 5.74e+02, 2.80e+02, 4.27e+02, 2.06e+02, 5.15e+02,
       7.18e+02, 9.61e+02, 7.73e+02, 9.62e+02, 5.79e+02, 7.68e+02,
       7.30e+01, 4.98e+02, 6.79e+02, 5.25e+02, 3.45e+02, 3.87e+02,
       1.46e+02, 6.49e+02, 4.91e+02, 8.12e+02, 6.82e+02, 1.93e+02,
       1.43e+02, 7.89e+02, 5.76e+02, 7.42e+02, 9.38e+02, 8.68e+02,
       6.43e+02, 3.47e+02, 8.35e+02, 1.17e+02, 8.40e+02, 4.57e+02,
       4.42e+02, 4.58e+02, 7.67e+02, 9.08e+02, 2.55e+02, 6.08e+02,
       5.47e+02, 5.80e+01, 9.13e+02, 9.50e+01, 8.78e+02, 5.70e+02,
       1.57e+02, 1.83e+02, 8.40e+01, 6.51e+02, 5.64e+02, 1.88e+02,
       8.60e+02, 1.62e+02, 2.67e+02, 8.03e+02, 9.63e+02, 5.60e+01,
       7.70e+02, 5.84e+02, 9.83e+02, 2.50e+02, 5.94e+02, 8.32e+02,
       2.00e+02, 6.92e+02, 8.91e+02, 2.60e+02, 3.38e+02, 4.54e+02,
       2.08e+02, 1.14e+02, 7.43e+02, 8.30e+02, 1.11e+02, 9.81e+02,
       1.66e+02, 3.08e+02, 9.65e+02, 3.57e+02, 5.68e+02, 2.61e+02,
       7.04e+02, 2.47e+02, 2.62e+02, 3.63e+02, 3.19e+02, 9.54e+02,
       1.37e+02, 1.47e+02, 2.40e+02, 8.22e+02, 6.69e+02, 5.13e+02,
       4.05e+02, 2.68e+02, 8.97e+02, 1.48e+02, 1.09e+02, 6.99e+02,
       5.82e+02, 5.59e+02, 1.67e+02, 2.45e+02, 6.40e+01, 2.84e+02,
       4.72e+02, 4.03e+02, 4.13e+02, 7.11e+02, 7.90e+02, 1.02e+02,
       3.15e+02, 7.65e+02, 9.85e+02, 4.78e+02, 9.57e+02, 5.52e+02,
       7.15e+02, 7.03e+02, 6.48e+02, 6.89e+02, 2.42e+02, 9.45e+02,
       3.40e+02, 6.87e+02, 5.91e+02, 7.45e+02, 3.74e+02, 1.38e+02,
       3.71e+02, 5.56e+02, 4.11e+02, 1.73e+02, 4.44e+02, 7.36e+02,
       9.44e+02, 8.31e+02, 6.13e+02, 9.93e+02, 2.39e+02, 8.44e+02,
       6.74e+02, 5.42e+02, 3.95e+02, 8.29e+02, 9.15e+02, 4.70e+02,
       9.39e+02, 9.40e+01, 6.65e+02, 5.06e+02, 7.30e+02, 8.69e+02,
       1.87e+02, 1.20e+02, 7.26e+02, 9.66e+02, 4.45e+02, 8.30e+01,
       7.00e+01, 3.93e+02, 4.87e+02, 3.01e+02, 8.62e+02, 7.21e+02,
       8.60e+01, 9.99e+02, 4.93e+02, 5.60e+02, 1.16e+02, 1.39e+02,
       4.24e+02, 6.30e+01, 6.60e+01, 4.22e+02, 6.64e+02, 5.51e+02,
       2.69e+02, 9.23e+02, 4.51e+02, 2.25e+02, 9.19e+02, 2.73e+02,
       6.20e+02, 3.49e+02, 9.06e+02, 4.92e+02, 9.28e+02, 5.45e+02,
       1.63e+02, 6.66e+02, 8.19e+02, 2.01e+02, 9.95e+02, 7.40e+02,
       3.00e+02, 8.94e+02, 5.28e+02, 7.20e+02, 8.21e+02, 7.94e+02,
       6.80e+02, 3.52e+02, 5.50e+02, 2.74e+02, 2.64e+02, 7.37e+02,
       3.90e+02, 9.11e+02, 7.61e+02, 9.01e+02, 1.61e+02, 9.80e+01,
       2.54e+02, 5.07e+02, 6.41e+02, 3.59e+02, 3.75e+02, 7.69e+02,
       3.48e+02, 9.72e+02, 6.75e+02, 6.93e+02, 5.97e+02, 1.59e+02,
       9.32e+02, 3.24e+02, 3.73e+02, 4.86e+02, 9.90e+01, 8.75e+02,
       7.25e+02, 5.18e+02, 5.46e+02, 5.86e+02, 8.24e+02, 6.21e+02,
       9.04e+02, 4.17e+02, 8.90e+01, 5.54e+02, 6.80e+01, 6.33e+02,
       6.12e+02, 4.85e+02, 7.79e+02, 5.00e+02, 4.06e+02, 6.42e+02,
       2.63e+02, 5.87e+02, 1.05e+02, 1.00e+02, 2.17e+02, 7.05e+02,
       2.13e+02, 9.00e+02, 8.59e+02, 2.81e+02, 9.64e+02, 8.54e+02,
       4.76e+02, 9.90e+02, 3.66e+02, 9.30e+02, 9.70e+02, 6.30e+02,
       3.35e+02, 4.09e+02, 8.11e+02, 7.95e+02, 2.35e+02, 4.83e+02,
       7.70e+01, 8.80e+01, 2.59e+02, 8.57e+02, 2.02e+02, 3.61e+02,
       7.10e+02, 6.63e+02, 1.34e+02, 8.84e+02, 6.02e+02, 6.32e+02,
       6.78e+02, 5.12e+02, 4.21e+02, 1.56e+02, 2.49e+02, 4.39e+02,
       2.98e+02, 4.77e+02, 4.47e+02, 5.10e+02, 4.40e+02, 3.82e+02,
       3.21e+02, 1.76e+02, 8.50e+01, 5.96e+02, 1.52e+02, 3.16e+02,
       8.76e+02, 8.39e+02, 1.18e+02, 7.57e+02, 6.00e+02, 6.24e+02,
       7.02e+02, 8.52e+02, 5.09e+02, 5.32e+02, 8.26e+02, 1.35e+02,
       1.80e+02, 8.16e+02, 2.92e+02, 3.65e+02, 6.96e+02, 7.38e+02,
       8.43e+02, 8.71e+02, 4.35e+02, 5.24e+02, 3.07e+02, 7.27e+02,
       2.24e+02, 9.74e+02, 7.74e+02, 9.40e+02, 3.32e+02, 7.64e+02,
       2.52e+02, 3.33e+02, 7.91e+02, 4.01e+02, 1.54e+02, 5.03e+02,
       1.13e+02, 9.58e+02, 2.30e+02, 4.10e+02, 4.31e+02, 5.37e+02,
       3.31e+02, 7.06e+02, 4.59e+02, 7.50e+02, 7.75e+02, 5.16e+02,
       7.54e+02, 5.48e+02, 3.04e+02, 4.46e+02, 8.15e+02, 4.30e+02,
       3.22e+02, 4.97e+02, 3.37e+02, 8.41e+02, 6.90e+02, 2.27e+02,
       2.26e+02, 5.30e+02, 4.33e+02, 7.40e+01, 3.60e+02, 1.97e+02,
       3.84e+02, 7.39e+02, 9.49e+02, 1.22e+02, 7.87e+02, 9.14e+02,
       3.78e+02, 8.70e+02, 3.20e+02, 1.23e+02, 7.76e+02, 2.41e+02,
       1.25e+02, 4.65e+02, 2.87e+02, 5.89e+02, 2.94e+02, 7.48e+02,
       2.54e+05])
In [40]:
# Limpieza #8: Vamos a aplicar lo mismo con el campo service Fee aunque nos ahorramos el split al no tener decimales. 
# Toca volver a hacer la corrección de negativos para el fee negativo y convertir el nan en 0
In [41]:
LosAlpesHistorydf['service fee'].unique()
Out[41]:
array(['$ 170', '$ 93', '$ 175', '$ 163', '$ 65', '$ 157', '$ 173',
       '$ 150', '$ 79', '$ 92', '$ 24', '$ 216', '$ 89', '$ 141', '$ 185',
       '$ 199', '$ 177', '$ 152', '$ 134', '$ 229', '$ 55', '$ 203',
       '$ 78', '$ 160', '$ 212', '$ 166', '$ 167', '$ 61', '$ 23',
       '$ 230', '$ 16', '$ 40', '$ 111', '$ 207', '$ 56', '$ 137', '$ 76',
       '$ 145', '$ 21', '$ 43', '$ 198', '$ 118', '$ 33', '$ 18', '$ 156',
       '$ 81', '$ 14', '$ 35', '$ 99', '$ 227', '$ 192', '$ 54', '$ 46',
       '$ 232', '$ 86', '$ 45', '$ 22', '$ 31', '$ 180', '$ 100', '$ 121',
       '$ 204', '$ 57', '$ 108', nan, '$ 119', '$ 42', '$ 238', '$ 168',
       '$ 34', '$ 206', '$ 96', '$ 104', '$ 233', '$ 224', '$ 95', '$ 37',
       '$ 117', '$ 102', '$ 226', '$ 221', '$ 219', '$ 122', '$ 50',
       '$ 179', '$ 220', '$ 59', '$ 200', '$ 109', '$ 169', '$ 115',
       '$ 123', '$ 120', '$ 60', '$ 161', '$ 12', '$ 197', '$ 68',
       '$ 213', '$ 98', '$ 113', '$ 29', '$ 70', '$ 146', '$ 107',
       '$ 194', '$ 190', '$ 90', '$ 73', '$ 124', '$ 231', '$ 139',
       '$ 58', '$ 155', '$ 75', '$ 126', '$ 69', '$ 210', '$ 236',
       '$ 132', '$ 38', '$ 91', '$ 48', '$ 165', '$ 26', '$ 47', '$ 195',
       '$ 87', '$ 63', '$ 136', '$ 71', '$ 41', '$ 112', '$ 196', '$ 162',
       '$ 218', '$ 183', '$ 159', '$ 80', '$ 182', '$ 27', '$ 191',
       '$ 74', '$ 125', '$ 10', '$ 208', '$ 94', '$ 149', '$ 36', '$ 114',
       '$ 103', '$ 147', '$ 193', '$ 209', '$ 142', '$ 19', '$ 11',
       '$ 158', '$ 140', '$ 110', '$ 84', '$ 178', '$ 83', '$ 151',
       '$ 234', '$ 189', '$ 176', '$ 237', '$ 153', '$ 67', '$ 186',
       '$ 15', '$ 205', '$ 131', '$ 143', '$ 52', '$ 82', '$ 53', '$ 51',
       '$ 28', '$ 30', '$ 77', '$ 72', '$ 101', '$ 225', '$ 62', '$ 128',
       '$ 217', '$ 85', '$ 129', '$ 171', '$ 39', '$ 222', '$ 127',
       '$ 116', '$ 144', '$ 184', '$ 44', '$ 188', '$ 138', '$ 164',
       '$ 49', '$ 211', '$ 106', '$ 135', '$ 223', '$ 130', '$ 172',
       '$ 187', '$ 105', '$ 32', '$ 215', '$ 64', '$ 154', '$ 239',
       '$ 17', '$ 228', '$ 181', '$ 66', '$ 174', '$ 133', '$ 148',
       '$ 13', '$ 214', '$ 201', '$ 20', '$ 88', '$ 235', '$ 25', '$ 240',
       '$ 97', '$ 202', '$ 122000', '$ -193'], dtype=object)
In [42]:
LosAlpesHistorydf['service fee']=LosAlpesHistorydf['service fee'].str.slice_replace(stop=2, repl='')
LosAlpesHistorydf['service fee']=LosAlpesHistorydf['service fee'].replace(np.nan, '0.0')
LosAlpesHistorydf['service fee']=LosAlpesHistorydf['service fee'].replace(',', '.',regex=True)
LosAlpesHistorydf['service fee']=LosAlpesHistorydf['service fee'].astype(float, errors = 'raise')
In [43]:
LosAlpesHistorydf['service fee']=LosAlpesHistorydf['service fee'].apply(corregirNegativo)
In [44]:
LosAlpesHistorydf['service fee'].unique()
Out[44]:
array([1.70e+02, 9.30e+01, 1.75e+02, 1.63e+02, 6.50e+01, 1.57e+02,
       1.73e+02, 1.50e+02, 7.90e+01, 9.20e+01, 2.40e+01, 2.16e+02,
       8.90e+01, 1.41e+02, 1.85e+02, 1.99e+02, 1.77e+02, 1.52e+02,
       1.34e+02, 2.29e+02, 5.50e+01, 2.03e+02, 7.80e+01, 1.60e+02,
       2.12e+02, 1.66e+02, 1.67e+02, 6.10e+01, 2.30e+01, 2.30e+02,
       1.60e+01, 4.00e+01, 1.11e+02, 2.07e+02, 5.60e+01, 1.37e+02,
       7.60e+01, 1.45e+02, 2.10e+01, 4.30e+01, 1.98e+02, 1.18e+02,
       3.30e+01, 1.80e+01, 1.56e+02, 8.10e+01, 1.40e+01, 3.50e+01,
       9.90e+01, 2.27e+02, 1.92e+02, 5.40e+01, 4.60e+01, 2.32e+02,
       8.60e+01, 4.50e+01, 2.20e+01, 3.10e+01, 1.80e+02, 1.00e+02,
       1.21e+02, 2.04e+02, 5.70e+01, 1.08e+02, 0.00e+00, 1.19e+02,
       4.20e+01, 2.38e+02, 1.68e+02, 3.40e+01, 2.06e+02, 9.60e+01,
       1.04e+02, 2.33e+02, 2.24e+02, 9.50e+01, 3.70e+01, 1.17e+02,
       1.02e+02, 2.26e+02, 2.21e+02, 2.19e+02, 1.22e+02, 5.00e+01,
       1.79e+02, 2.20e+02, 5.90e+01, 2.00e+02, 1.09e+02, 1.69e+02,
       1.15e+02, 1.23e+02, 1.20e+02, 6.00e+01, 1.61e+02, 1.20e+01,
       1.97e+02, 6.80e+01, 2.13e+02, 9.80e+01, 1.13e+02, 2.90e+01,
       7.00e+01, 1.46e+02, 1.07e+02, 1.94e+02, 1.90e+02, 9.00e+01,
       7.30e+01, 1.24e+02, 2.31e+02, 1.39e+02, 5.80e+01, 1.55e+02,
       7.50e+01, 1.26e+02, 6.90e+01, 2.10e+02, 2.36e+02, 1.32e+02,
       3.80e+01, 9.10e+01, 4.80e+01, 1.65e+02, 2.60e+01, 4.70e+01,
       1.95e+02, 8.70e+01, 6.30e+01, 1.36e+02, 7.10e+01, 4.10e+01,
       1.12e+02, 1.96e+02, 1.62e+02, 2.18e+02, 1.83e+02, 1.59e+02,
       8.00e+01, 1.82e+02, 2.70e+01, 1.91e+02, 7.40e+01, 1.25e+02,
       1.00e+01, 2.08e+02, 9.40e+01, 1.49e+02, 3.60e+01, 1.14e+02,
       1.03e+02, 1.47e+02, 1.93e+02, 2.09e+02, 1.42e+02, 1.90e+01,
       1.10e+01, 1.58e+02, 1.40e+02, 1.10e+02, 8.40e+01, 1.78e+02,
       8.30e+01, 1.51e+02, 2.34e+02, 1.89e+02, 1.76e+02, 2.37e+02,
       1.53e+02, 6.70e+01, 1.86e+02, 1.50e+01, 2.05e+02, 1.31e+02,
       1.43e+02, 5.20e+01, 8.20e+01, 5.30e+01, 5.10e+01, 2.80e+01,
       3.00e+01, 7.70e+01, 7.20e+01, 1.01e+02, 2.25e+02, 6.20e+01,
       1.28e+02, 2.17e+02, 8.50e+01, 1.29e+02, 1.71e+02, 3.90e+01,
       2.22e+02, 1.27e+02, 1.16e+02, 1.44e+02, 1.84e+02, 4.40e+01,
       1.88e+02, 1.38e+02, 1.64e+02, 4.90e+01, 2.11e+02, 1.06e+02,
       1.35e+02, 2.23e+02, 1.30e+02, 1.72e+02, 1.87e+02, 1.05e+02,
       3.20e+01, 2.15e+02, 6.40e+01, 1.54e+02, 2.39e+02, 1.70e+01,
       2.28e+02, 1.81e+02, 6.60e+01, 1.74e+02, 1.33e+02, 1.48e+02,
       1.30e+01, 2.14e+02, 2.01e+02, 2.00e+01, 8.80e+01, 2.35e+02,
       2.50e+01, 2.40e+02, 9.70e+01, 2.02e+02, 1.22e+05])
In [45]:
# Limpieza #9: para los valores numericos vamos a utilizar la función corregir negativo. Esto partiendo del hecho de que si existen valores negativos serán errores
# de digitación (los volveremos positivos) y si hay un NaN quedará en cero.

LosAlpesHistorydf['service fee']=LosAlpesHistorydf['service fee'].apply(corregirNegativo)
LosAlpesHistorydf['minimum nights']=LosAlpesHistorydf['minimum nights'].apply(corregirNegativo)
LosAlpesHistorydf['availability 365']=LosAlpesHistorydf['availability 365'].apply(corregirNegativo)
LosAlpesHistorydf['number of reviews']=LosAlpesHistorydf['number of reviews'].apply(corregirNegativo)
LosAlpesHistorydf['review rate number']=LosAlpesHistorydf['review rate number'].apply(corregirNegativo)
In [46]:
# Limpieza #10: Para el año de construcción, esperarmos outlayers y valores nan. Depende de lo poco logicos de los valores se tomarán acciones.
LosAlpesHistorydf['construction year'].unique()
Out[46]:
array([2011., 2021., 2004., 2012., 2007., 2022., 2008., 2009., 2006.,
       2003., 2014., 2005., 2020., 2015., 2016., 2013., 2018., 2017.,
       2019., 2010.,   nan, 1020., 1022.])
In [47]:
def corregirAño(x):
    if x > 2002: 
        return x
    elif x <= 1022:
        return x + 1000
    else: 
        return 2002
In [48]:
LosAlpesHistorydf['construction year']=LosAlpesHistorydf['construction year'].apply(corregirAño)
In [49]:
LosAlpesHistorydf['construction year'].unique()
Out[49]:
array([2011., 2021., 2004., 2012., 2007., 2022., 2008., 2009., 2006.,
       2003., 2014., 2005., 2020., 2015., 2016., 2013., 2018., 2017.,
       2019., 2010., 2002.])
In [50]:
# Limpieza #11: Se crea una función para corregir las disponibilidades mayores a 365 tomando como base que ningun sitio puede tener mas de 365 de disponibilidad en un año.

def corregirDisponibilidad(x):
    if x < 366: 
        return x
    elif x >= 366:
        return 365
    else: 
        return 365
In [51]:
LosAlpesHistorydf['availability 365']=LosAlpesHistorydf['availability 365'].apply(corregirDisponibilidad)
In [52]:
# Limpieza #12: por ultimo, vamos a quitar duplicados. 

LosAlpesHistorydf.drop_duplicates(keep='first', inplace=True)
LosAlpesHistorydf.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 101547 entries, 0 to 102082
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   101547 non-null  int64  
 1   neighbourhood group  101547 non-null  object 
 2   neighbourhood        100946 non-null  object 
 3   lat                  100954 non-null  float64
 4   long                 100954 non-null  float64
 5   country              101547 non-null  object 
 6   instant_bookable     101547 non-null  bool   
 7   cancellation_policy  101547 non-null  object 
 8   room type            101547 non-null  object 
 9   construction year    101547 non-null  float64
 10  price                101547 non-null  float64
 11  service fee          101547 non-null  float64
 12  minimum nights       101547 non-null  float64
 13  availability 365     101547 non-null  float64
 14  number of reviews    101547 non-null  float64
 15  review rate number   101547 non-null  float64
dtypes: bool(1), float64(9), int64(1), object(5)
memory usage: 12.5+ MB
In [53]:
# Posterior a la limpieza de datos, vamos a hacer un pequeño analisis bivariado tomando como base la zona o grupo de vecindario y el numero de reviews
# esto me podrá dar luz sobre nuevas varias (numero de reviews por apartamento en la zona) y así tener mayor conocimiento del comportamiento de reviews
# en el set de datos.

df_grafica= LosAlpesHistorydf[['neighbourhood group','number of reviews']].groupby(['neighbourhood group']).sum('number of reviews')
df_grafica=df_grafica.sort_values('neighbourhood group', ascending=True).reset_index()
df_grafica.plot('neighbourhood group','number of reviews', kind = 'bar')
Out[53]:
<AxesSubplot:xlabel='neighbourhood group'>
In [54]:
df_grafica2 = LosAlpesHistorydf[['neighbourhood group']].groupby(['neighbourhood group']).size().reset_index(name='num_registers')
df_grafica2=df_grafica2.sort_values('neighbourhood group', ascending=True).reset_index()
df_grafica2.plot('neighbourhood group','num_registers', kind = 'bar')
Out[54]:
<AxesSubplot:xlabel='neighbourhood group'>
In [55]:
df_grafica['num_registers']=df_grafica2['num_registers']
df_grafica['Average Review By Location']=df_grafica['number of reviews']/df_grafica['num_registers']
df_grafica=df_grafica.sort_values('Average Review By Location', ascending=False)
In [56]:
df_grafica.plot('neighbourhood group','Average Review By Location', kind = 'bar')
Out[56]:
<AxesSubplot:xlabel='neighbourhood group'>
In [57]:
# Con esta relación entendemos que el numero promedio de review por zona no varia demasiado entre Bronx, Queens y State Island. 
df_grafica
Out[57]:
neighbourhood group number of reviews num_registers Average Review By Location
5 State Island 33428.0 937 35.675560
4 Queens 435165.0 12952 33.598286
0 Bronx 83639.0 2649 31.573801
1 Brooklyn 1172642.0 41236 28.437336
2 Manhattan 1038884.0 43159 24.071086
3 No Information 3048.0 614 4.964169
In [58]:
df_grafica3 = LosAlpesHistorydf[['availability 365','number of reviews']].groupby(['availability 365']).sum('number of reviews')
df_grafica3 = df_grafica3.sort_values('availability 365', ascending=True).reset_index()
df_grafica4 = LosAlpesHistorydf[['availability 365']].groupby(['availability 365']).size().reset_index(name='num_registers')
df_grafica4 = df_grafica4.sort_values('availability 365', ascending=True).reset_index()
In [59]:
df_grafica3=df_grafica3.reset_index()
In [60]:
df_grafica3['num_registers']=df_grafica4['num_registers']
In [61]:
df_grafica3['Average by Day']=df_grafica3['number of reviews']/df_grafica3['num_registers']
In [62]:
df_grafica3
Out[62]:
index availability 365 number of reviews num_registers Average by Day
0 0 0.0 253266.0 24283 10.429766
1 1 1.0 22446.0 774 29.000000
2 2 2.0 11801.0 533 22.140713
3 3 3.0 16354.0 614 26.635179
4 4 4.0 11680.0 474 24.641350
... ... ... ... ... ...
361 361 361.0 11279.0 276 40.865942
362 362 362.0 8486.0 375 22.629333
363 363 363.0 9571.0 500 19.142000
364 364 364.0 17439.0 1153 15.124892
365 365 365.0 129927.0 5178 25.092121

366 rows × 5 columns

In [63]:
# PUNTO 2: Muestre los resultados tras el entrenamiento y selección del mejor modelo obtenido a partir de los datos suministrados. Utilice las 
# técnicas de transformación de datos, selección de atributos y regularización que considere convenientes. Explique cómo este modelo responde al 
# objetivo planteado y si los resultados serían, de acuerdo a su criterio, suficientes para su uso por parte de la inmobiliaria. Recuerde argumentar 
# sus decisiones con base en la interpretación cuantitativa y cualitativa de los resultados.
In [64]:
# Primero vamos a preparar el modelo para su entrenamiento
enc = OneHotEncoder(handle_unknown='ignore')
salida = enc.fit(LosAlpesHistorydf)
In [65]:
# Posterior a esto transformamos los datos. Country no lo hacemos por tener un solo valor.
le = LabelEncoder()
LosAlpesHistorydf['neighbourhood group encoded']=le.fit_transform(LosAlpesHistorydf['neighbourhood group'])
LosAlpesHistorydf['neighbourhood encoded']=le.fit_transform(LosAlpesHistorydf['neighbourhood'])
LosAlpesHistorydf['cancellation_policy encoded']=le.fit_transform(LosAlpesHistorydf['cancellation_policy'])
LosAlpesHistorydf['room type encoded']=le.fit_transform(LosAlpesHistorydf['room type'])
In [66]:
LosAlpesHistorydf.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 101547 entries, 0 to 102082
Data columns (total 20 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           101547 non-null  int64  
 1   neighbourhood group          101547 non-null  object 
 2   neighbourhood                100946 non-null  object 
 3   lat                          100954 non-null  float64
 4   long                         100954 non-null  float64
 5   country                      101547 non-null  object 
 6   instant_bookable             101547 non-null  bool   
 7   cancellation_policy          101547 non-null  object 
 8   room type                    101547 non-null  object 
 9   construction year            101547 non-null  float64
 10  price                        101547 non-null  float64
 11  service fee                  101547 non-null  float64
 12  minimum nights               101547 non-null  float64
 13  availability 365             101547 non-null  float64
 14  number of reviews            101547 non-null  float64
 15  review rate number           101547 non-null  float64
 16  neighbourhood group encoded  101547 non-null  int32  
 17  neighbourhood encoded        101547 non-null  int32  
 18  cancellation_policy encoded  101547 non-null  int32  
 19  room type encoded            101547 non-null  int32  
dtypes: bool(1), float64(9), int32(4), int64(1), object(5)
memory usage: 14.0+ MB
In [67]:
# En este caso quité columnas id, latitud, longitud, país (todo es EEUU) que no aportan mayor valor al analisis. Luego revisaremos la correlación entre variables 
# para definir que otras variables aportan poco valor.

LosAlpesHistoryDFV2 = LosAlpesHistorydf[['instant_bookable', 'room type encoded','construction year','price','service fee','minimum nights','availability 365','number of reviews','neighbourhood group encoded','cancellation_policy encoded','neighbourhood encoded']]
In [68]:
LosAlpesHistoryDFV2.drop_duplicates(keep='first', inplace=True)
In [69]:
LosAlpesHistoryDFV2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 94973 entries, 0 to 102082
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   instant_bookable             94973 non-null  bool   
 1   room type encoded            94973 non-null  int32  
 2   construction year            94973 non-null  float64
 3   price                        94973 non-null  float64
 4   service fee                  94973 non-null  float64
 5   minimum nights               94973 non-null  float64
 6   availability 365             94973 non-null  float64
 7   number of reviews            94973 non-null  float64
 8   neighbourhood group encoded  94973 non-null  int32  
 9   cancellation_policy encoded  94973 non-null  int32  
 10  neighbourhood encoded        94973 non-null  int32  
dtypes: bool(1), float64(6), int32(4)
memory usage: 6.6 MB
In [70]:
LosAlpesHistoryDFV2.corr()
Out[70]:
instant_bookable room type encoded construction year price service fee minimum nights availability 365 number of reviews neighbourhood group encoded cancellation_policy encoded neighbourhood encoded
instant_bookable 1.000000 0.000588 0.002866 -0.001680 0.003341 -0.003025 -0.003863 0.000672 -0.002043 -0.003941 0.007372
room type encoded 0.000588 1.000000 0.005816 0.003934 0.003286 -0.054535 -0.009870 -0.001695 0.026484 0.003465 -0.069890
construction year 0.002866 0.005816 1.000000 -0.005168 -0.005040 -0.002781 -0.006711 0.000730 -0.003943 0.005685 -0.001876
price -0.001680 0.003934 -0.005168 1.000000 0.010493 -0.002520 0.003136 -0.000974 -0.001183 0.003545 0.001896
service fee 0.003341 0.003286 -0.005040 0.010493 1.000000 0.000070 -0.003929 -0.000802 -0.002356 -0.000335 -0.004418
minimum nights -0.003025 -0.054535 -0.002781 -0.002520 0.000070 1.000000 0.056980 -0.047902 -0.001105 -0.000551 0.013433
availability 365 -0.003863 -0.009870 -0.006711 0.003136 -0.003929 0.056980 1.000000 0.093556 0.066280 0.000912 -0.016052
number of reviews 0.000672 -0.001695 0.000730 -0.000974 -0.000802 -0.047902 0.093556 1.000000 0.022066 -0.002430 -0.044921
neighbourhood group encoded -0.002043 0.026484 -0.003943 -0.001183 -0.002356 -0.001105 0.066280 0.022066 1.000000 0.002763 0.082672
cancellation_policy encoded -0.003941 0.003465 0.005685 0.003545 -0.000335 -0.000551 0.000912 -0.002430 0.002763 1.000000 0.003253
neighbourhood encoded 0.007372 -0.069890 -0.001876 0.001896 -0.004418 0.013433 -0.016052 -0.044921 0.082672 0.003253 1.000000
In [71]:
# Se logra evidenciar que no hay variables altamente relacionadas a la variable Number of Reviews. Vamos a seleccionar las que esten en estén entre
# de 0.0099999 y -0.0099999 para el modelo de regressión Lasso (necesita pocas variales). El modelo completo será usado para el modelo de regresión 
# Lineal y el Ridge

LosAlpesHistoryDFVlasso = LosAlpesHistorydf[['room type encoded','minimum nights','availability 365','number of reviews','neighbourhood group encoded','cancellation_policy encoded','neighbourhood encoded']]
In [72]:
# Generaremos un informe sobre el DF de la regresión Lineal y la regresión Ridge 
profile = ProfileReport(LosAlpesHistoryDFV2)
profile.to_notebook_iframe()
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
In [73]:
LosAlpesHistoryDFV2['number of reviews'].describe(percentiles = [.25, .5, .75, .95, .99])
Out[73]:
count    94973.000000
mean        27.545271
std         50.036680
min          0.000000
25%          1.000000
50%          7.000000
75%         30.000000
95%        126.000000
99%        235.000000
max       1024.000000
Name: number of reviews, dtype: float64
In [74]:
plt.figure(figsize = (28, 3))
plt.boxplot(LosAlpesHistoryDFV2['number of reviews'], vert = False)
plt.show()
In [75]:
# Ahora pasaremos a definir el set de entrenamiento y set de pruebas

lin_reg = LinearRegression()
x = LosAlpesHistoryDFV2[['instant_bookable', 'room type encoded','construction year','price','service fee','minimum nights','availability 365','neighbourhood group encoded','cancellation_policy encoded','neighbourhood encoded']] 
y = LosAlpesHistoryDFV2['number of reviews']
In [76]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)
In [77]:
# Escalando caracteristicas
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [78]:
regr = LinearRegression()
regr.fit(X_train_scaled, Y_train)
Out[78]:
LinearRegression()
In [79]:
regr.coef_
Out[79]:
array([-0.01528907, -0.17061288,  0.13195494, -0.07941148, -0.05112678,
       -2.42945799,  4.75721361,  0.96776219, -0.21131593, -2.27824987])
In [80]:
regr.intercept_
Out[80]:
27.62672041635956
In [81]:
# Evaluación del Modelo
In [82]:
preds_train = regr.predict(X_train_scaled)
preds_test = regr.predict(X_test_scaled)
In [83]:
MAE_LinealRegretion=mean_absolute_error(Y_train, preds_train), mean_absolute_error(Y_test, preds_test)
MAE_LinealRegretion
Out[83]:
(31.093547988783314, 30.85671681609628)
In [84]:
RMSE_LinealRegretion=np.sqrt(mean_squared_error(Y_train, preds_train)), np.sqrt(mean_squared_error(Y_test, preds_test))
RMSE_LinealRegretion
Out[84]:
(49.753600629050645, 49.546126228626306)
In [85]:
plt.figure(figsize = (28, 3))
plt.boxplot((Y_test - preds_test), vert = False)
plt.grid()
plt.show()
In [86]:
worst_condition = np.where((Y_test - preds_test) > 3, True, False)
In [87]:
worst_cases = pd.concat([
    X_test.loc[worst_condition],
    Y_test.loc[worst_condition]
], axis = 1).reset_index(drop = True)
In [88]:
worst_cases['predictions'] = pd.Series(preds_test).loc[worst_condition].reset_index(drop = True)
In [89]:
worst_cases.shape
Out[89]:
(7025, 12)
In [90]:
worst_cases
Out[90]:
instant_bookable room type encoded construction year price service fee minimum nights availability 365 neighbourhood group encoded cancellation_policy encoded neighbourhood encoded number of reviews predictions
0 False 0 2008.0 703.00 141.0 2.0 44.0 4 2 167 30.0 24.702726
1 True 3 2015.0 1.02 204.0 2.0 33.0 1 2 28 90.0 26.037481
2 True 0 2012.0 698.00 140.0 2.0 60.0 2 2 62 67.0 26.914470
3 True 3 2012.0 782.00 156.0 30.0 363.0 1 3 13 59.0 35.774765
4 False 3 2005.0 100.00 20.0 1.0 345.0 1 1 217 55.0 31.060535
... ... ... ... ... ... ... ... ... ... ... ... ...
7020 True 3 2013.0 1.10 219.0 1.0 362.0 4 3 107 102.0 37.711804
7021 True 0 2020.0 212.00 42.0 3.0 53.0 5 3 168 79.0 25.970045
7022 False 0 2006.0 370.00 74.0 4.0 1.0 1 1 13 239.0 25.590953
7023 False 0 2017.0 449.00 90.0 4.0 6.0 2 2 205 89.0 20.371231
7024 True 3 2010.0 420.00 84.0 3.0 136.0 1 1 162 169.0 25.286041

7025 rows × 12 columns

In [91]:
poly_features = PolynomialFeatures(degree=3,include_bias=False)
x_poly = poly_features.fit_transform(X_train)
In [92]:
ridge_df = Ridge(alpha=1)
ridge_df.fit(x_poly,Y_train)
df_poly_training=ridge_df.predict(x_poly)
x_poly_test = poly_features.fit_transform(X_test)
ridge_df_test = Ridge(alpha=1)
ridge_df_test.fit(x_poly_test,Y_test)
Out[92]:
Ridge(alpha=1)
In [93]:
df_poly_test=ridge_df_test.predict(x_poly_test)
MAE_RidgeRegretion=mean_absolute_error(Y_train, df_poly_training), mean_absolute_error(Y_test, df_poly_test)
MAE_RidgeRegretion
Out[93]:
(29.54641094681216, 29.06540975928463)
In [94]:
RMSE_RidgeRegretion=np.sqrt(mean_squared_error(Y_train, df_poly_training)), np.sqrt(mean_squared_error(Y_test, df_poly_test))
RMSE_RidgeRegretion
Out[94]:
(48.41898641139969, 48.0932730258384)
In [95]:
# Para lasso, debemos utilizar un set de variables más acotadas. Así que utilizaremos los campos definidos en el DF LosAlpesHistoryDFVlasso
In [96]:
xLasso = LosAlpesHistoryDFV2[['room type encoded','minimum nights','availability 365','neighbourhood group encoded','cancellation_policy encoded','neighbourhood encoded']]
yLasso = LosAlpesHistoryDFV2['number of reviews']
In [97]:
X_train, X_test, Y_train, Y_test = train_test_split(xLasso, yLasso, test_size = 0.3, random_state = 1)

lasso_features = PolynomialFeatures(degree=3,include_bias=False)
x_lasso = lasso_features.fit_transform(X_train)

lasso_df = Lasso(alpha=5)
lasso_df.fit(x_lasso,Y_train)
df_lasso_training=lasso_df.predict(x_lasso)
x_lasso_test = lasso_features.fit_transform(X_test)
lasso_df_test = Lasso(alpha=5)
lasso_df_test.fit(x_lasso_test,Y_test)
Out[97]:
Lasso(alpha=5)
In [98]:
df_lasso_test=lasso_df_test.predict(x_lasso_test)
MAE_Laso=mean_absolute_error(Y_train, df_lasso_training), mean_absolute_error(Y_test, df_lasso_test)
MAE_Laso
Out[98]:
(29.834082922903125, 29.311766051029927)
In [99]:
RMSE_Lasso=np.sqrt(mean_squared_error(Y_train, df_lasso_training)), np.sqrt(mean_squared_error(Y_test, df_lasso_test))
RMSE_Lasso
Out[99]:
(48.74487729665361, 48.40488986379656)
In [100]:
# Finalmente compararemos los tres modelos con las variables almacenadas 
In [101]:
MAE_LinealRegretion, MAE_RidgeRegretion, MAE_Laso
Out[101]:
((31.093547988783314, 30.85671681609628),
 (29.54641094681216, 29.06540975928463),
 (29.834082922903125, 29.311766051029927))
In [102]:
RMSE_LinealRegretion, RMSE_RidgeRegretion, RMSE_Lasso
Out[102]:
((49.753600629050645, 49.546126228626306),
 (48.41898641139969, 48.0932730258384),
 (48.74487729665361, 48.40488986379656))
In [103]:
# Dado que no existe grandes diferencias, como resultado de la experimentación entre LASO y RIDGE, tome la decisión de predecir el modelo con RIDGE dado que me permite 
# utilizar la totalidad de variables del dataset nuevo. La diferencia entre lo entrenado y el test en MAE del Ridge es apenas 0,48 y en LASO es 0,34. Insisto, se que LASO es un poco mejor pero 
# sacrificaré eso para poder utilizar todas las columnas del nuevo dataset.
In [104]:
# PUNTO 3: Utilice el mejor modelo obtenido para estimar la popularidad de los inmuebles próximos a publicarse. A partir de este resultado, apoye a la 
# inmobiliaria en la definición del presupuesto de marketing que debe aprovisionar para promocionar los inmuebles menos populares sabiendo que la 
# plataforma en la que se publican cobra por anuncio el 2% del precio del inmueble. Por ejemplo, si un inmueble tiene como precio $200, la plataforma
# cobrará por publicar un anuncio para el inmueble un total de $4. Usted es el encargado,a partir de los resultados de las predicciones, de definir el
# límite superior de comentarios para considerar un inmueble no popular. Explique esta decisión claramente así como el total de presupuesto que se 
# debe aprovisionar acompañado del total de inmuebles a promocionar
In [105]:
# Vamos a iniciar realizando los mismos 11 pasos que se hicieron en la transformación de datos del primer dataFrame. 
LosAlpesNewdf = pd.read_csv(r"C:\Users\LGARCIA\OneDrive - Valorem\LGARCIA\Documents\04. Estudio\05. Maestria\Ciencia Aplicada de Datos\CAD_Parcial1\Data\losalpes_new.csv")
LosAlpesNewdf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 513 entries, 0 to 512
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   513 non-null    int64  
 1   neighbourhood group  512 non-null    object 
 2   neighbourhood        512 non-null    object 
 3   lat                  512 non-null    float64
 4   long                 512 non-null    float64
 5   country              507 non-null    object 
 6   instant_bookable     512 non-null    object 
 7   cancellation_policy  512 non-null    object 
 8   room type            512 non-null    object 
 9   construction year    512 non-null    float64
 10  price                512 non-null    object 
 11  service fee          511 non-null    object 
 12  minimum nights       510 non-null    float64
 13  availability 365     511 non-null    float64
dtypes: float64(5), int64(1), object(8)
memory usage: 56.2+ KB
In [106]:
LosAlpesNewdf['neighbourhood group'].unique()
Out[106]:
array(['Queens', 'Manhattan', 'Brooklyn', 'Bronx', 'Staten Island', nan],
      dtype=object)
In [107]:
LosAlpesNewdf['neighbourhood group']=LosAlpesNewdf['neighbourhood group'].replace(np.nan, 'No Information')
LosAlpesNewdf['neighbourhood group']=LosAlpesNewdf['neighbourhood group'].apply(corregirneighbourhoodGroup1)
LosAlpesNewdf['neighbourhood group']=LosAlpesNewdf['neighbourhood group'].apply(corregirneighbourhoodGroup2)
LosAlpesNewdf['neighbourhood group'].unique()
Out[107]:
array(['Queens', 'Manhattan', 'Brooklyn', 'Bronx', 'State Island',
       'No Information'], dtype=object)
In [108]:
LosAlpesNewdf['neighbourhood']=LosAlpesNewdf['neighbourhood'].str.strip()
In [109]:
LosAlpesNewdf['country']="United States"
LosAlpesNewdf['instant_bookable']=LosAlpesNewdf['instant_bookable'].replace(np.nan, False)
LosAlpesNewdf['cancellation_policy']=LosAlpesNewdf['cancellation_policy'].replace(np.nan, "No Information")
LosAlpesNewdf['room type']=LosAlpesNewdf['room type'].replace(np.nan, "No Information")
In [110]:
LosAlpesNewdf['construction year']=LosAlpesNewdf['construction year'].apply(corregirAño)
In [111]:
LosAlpesNewdf['price']=LosAlpesNewdf['price'].str.slice_replace(stop=2, repl='')
LosAlpesNewdf['price']=LosAlpesNewdf['price'].replace(np.nan, '0.0')
LosAlpesNewdf['price']=LosAlpesNewdf['price'].replace(',', '.',regex=True)
LosAlpesNewdf['price']=LosAlpesNewdf['price'].astype(float, errors = 'raise')
LosAlpesNewdf['price'].unique()
Out[111]:
array([519.  , 987.  , 999.  , 497.  , 820.  ,   1.11, 913.  , 238.  ,
       930.  , 822.  , 901.  , 249.  , 997.  , 888.  , 264.  , 263.  ,
       288.  , 144.  , 124.  , 352.  ,  89.  , 231.  ,   1.19, 649.  ,
       427.  , 615.  , 659.  , 502.  ,   1.15, 827.  , 695.  , 129.  ,
         1.17,   1.  , 943.  , 155.  , 842.  , 150.  , 984.  , 432.  ,
       664.  , 422.  , 613.  , 418.  ,   1.02, 118.  , 931.  , 177.  ,
       546.  , 793.  ,   1.01, 157.  , 559.  , 158.  , 898.  ,   1.12,
         1.09, 700.  ,   1.1 , 614.  , 965.  , 527.  ,   1.04, 736.  ,
       312.  , 274.  , 308.  , 642.  , 329.  , 488.  , 592.  ,   1.2 ,
       240.  , 380.  , 802.  ,   1.05, 672.  ,   1.06, 645.  ,   1.07,
       843.  , 761.  , 634.  , 808.  , 192.  , 401.  , 367.  , 171.  ,
       445.  , 216.  , 115.  , 114.  ,   1.18, 990.  , 500.  , 394.  ,
       701.  , 783.  , 307.  , 409.  , 103.  , 693.  , 284.  , 481.  ,
       662.  , 937.  , 838.  , 716.  , 976.  , 159.  , 834.  , 339.  ,
       348.  , 951.  ,  75.  , 925.  , 139.  , 840.  , 689.  , 720.  ,
       749.  , 601.  , 928.  , 327.  , 964.  , 289.  , 137.  ,   1.08,
       472.  , 168.  , 958.  , 622.  , 766.  , 323.  , 554.  , 967.  ,
       300.  ,  56.  , 265.  ,   1.14, 703.  , 146.  , 625.  , 596.  ,
       724.  , 821.  , 166.  , 214.  , 705.  , 854.  , 201.  , 444.  ,
       261.  , 117.  , 490.  , 926.  , 746.  , 977.  , 515.  , 172.  ,
       234.  , 813.  , 884.  , 354.  , 578.  , 521.  , 303.  , 453.  ,
       215.  , 602.  , 867.  , 814.  , 244.  , 737.  , 589.  ,   1.16,
       544.  , 795.  , 807.  ,  70.  , 383.  , 185.  , 536.  , 301.  ,
        61.  , 135.  , 668.  , 711.  , 403.  , 845.  , 597.  , 593.  ,
       525.  ,  64.  , 430.  , 341.  , 282.  , 829.  , 580.  , 290.  ,
       631.  , 873.  , 431.  ,  73.  , 969.  , 407.  , 326.  , 853.  ,
       207.  , 983.  , 337.  , 220.  , 357.  , 881.  ,  83.  , 851.  ,
       574.  , 902.  , 179.  , 893.  ,  50.  , 463.  , 322.  , 315.  ,
       346.  , 520.  , 286.  , 518.  , 489.  , 387.  , 398.  , 696.  ,
       687.  , 637.  , 599.  , 639.  , 606.  , 663.  , 636.  , 458.  ,
       304.  , 730.  , 498.  , 595.  , 332.  , 979.  ,  62.  , 228.  ,
       575.  ,   0.  , 621.  , 697.  , 495.  , 279.  , 197.  , 966.  ,
       465.  , 524.  , 547.  , 361.  , 970.  , 363.  , 141.  , 712.  ,
       459.  , 503.  ,  51.  , 423.  , 311.  , 506.  , 151.  ,   1.03,
       514.  , 694.  , 674.  , 774.  , 978.  , 392.  , 753.  , 100.  ,
         1.13, 562.  , 176.  , 619.  , 154.  , 143.  , 464.  , 877.  ,
       770.  , 651.  , 259.  ,  77.  , 174.  , 811.  , 470.  , 538.  ,
       438.  ,  68.  , 988.  ,  60.  , 287.  , 181.  , 309.  , 134.  ,
       944.  , 556.  , 208.  , 899.  , 864.  , 195.  , 803.  , 882.  ,
       478.  , 573.  , 219.  , 892.  , 390.  , 153.  , 833.  , 570.  ,
       870.  , 245.  , 400.  , 914.  , 785.  , 491.  , 678.  , 819.  ,
       773.  , 537.  , 273.  , 938.  , 837.  , 734.  , 440.  , 164.  ,
       230.  , 971.  , 358.  , 198.  , 751.  , 484.  , 948.  , 378.  ,
       366.  , 156.  , 828.  , 745.  , 775.  , 861.  , 345.  ,  69.  ,
       863.  , 342.  , 630.  , 110.  , 251.  ])
In [112]:
LosAlpesNewdf['service fee']=LosAlpesNewdf['service fee'].str.slice_replace(stop=2, repl='')
LosAlpesNewdf['service fee']=LosAlpesNewdf['service fee'].replace(np.nan, '0.0')
LosAlpesNewdf['service fee']=LosAlpesNewdf['service fee'].replace(',', '.',regex=True)
LosAlpesNewdf['service fee']=LosAlpesNewdf['service fee'].astype(float, errors = 'raise')
LosAlpesNewdf['service fee'].unique()
Out[112]:
array([104., 197., 200.,  99., 164., 221., 183.,  48., 186., 180.,  50.,
       199., 178.,  53.,  58.,  29.,  25.,  70.,  18.,  46., 237., 130.,
        85., 123., 132., 100., 230., 165., 139.,  26., 234., 189.,  31.,
       168.,  30.,  86., 133.,  84., 204.,  24.,  35., 109., 159., 203.,
       112.,  32., 225., 218., 140., 220., 193., 105., 208., 147.,  62.,
        55., 128.,  66.,  98., 118., 240.,  76., 160., 210., 134., 212.,
       129., 215., 169., 152., 127., 162.,  38.,  80.,  73.,  34.,  89.,
        43.,  23., 238., 236., 198.,  79., 157.,  61.,  82.,  21.,  57.,
        96., 187., 143., 195., 167., 222.,  68., 190.,  15., 185.,  28.,
       138., 144., 150., 120.,  65.,  27.,  94., 192., 124., 153., 111.,
        60.,  11., 228., 141., 224., 125., 119., 145., 214., 216.,  33.,
       171., 201.,  40.,  52., 149., 103., 202.,  47., 163., 177.,  71.,
       116.,  91., 173.,  49., 223., 232., 161., 219.,  14.,  77.,  37.,
       235., 107.,  12., 231., 142.,  81.,  13.,  56., 166., 126., 175.,
       239., 194.,  41.,  67.,  44., 176.,  17., 170., 115.,  36., 179.,
       233.,  10., 207.,  93.,  64.,  63.,  69., 137., 121.,  92., 146.,
       229., 196.,   0.,  39., 209.,  72., 101., 206., 135., 155.,  78.,
       151.,  20., 227., 154., 213., 108.,  88.,  42., 205., 114., 174.,
       136., 188.,  97., 172., 211.,  22.])
In [113]:
LosAlpesNewdf['minimum nights']=LosAlpesNewdf['minimum nights'].apply(corregirNegativo)
LosAlpesNewdf['availability 365']=LosAlpesNewdf['availability 365'].apply(corregirNegativo)
LosAlpesNewdf['availability 365']=LosAlpesNewdf['availability 365'].apply(corregirDisponibilidad)
In [114]:
LosAlpesNewdf.drop_duplicates(keep='first', inplace=True)
LosAlpesNewdf.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 513 entries, 0 to 512
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   513 non-null    int64  
 1   neighbourhood group  513 non-null    object 
 2   neighbourhood        512 non-null    object 
 3   lat                  512 non-null    float64
 4   long                 512 non-null    float64
 5   country              513 non-null    object 
 6   instant_bookable     513 non-null    bool   
 7   cancellation_policy  513 non-null    object 
 8   room type            513 non-null    object 
 9   construction year    513 non-null    float64
 10  price                513 non-null    float64
 11  service fee          513 non-null    float64
 12  minimum nights       513 non-null    float64
 13  availability 365     513 non-null    float64
dtypes: bool(1), float64(7), int64(1), object(5)
memory usage: 56.6+ KB
In [115]:
LosAlpesNewdf['neighbourhood group encoded']=le.fit_transform(LosAlpesNewdf['neighbourhood group'])
LosAlpesNewdf['neighbourhood encoded']=le.fit_transform(LosAlpesNewdf['neighbourhood'])
LosAlpesNewdf['cancellation_policy encoded']=le.fit_transform(LosAlpesNewdf['cancellation_policy'])
LosAlpesNewdf['room type encoded']=le.fit_transform(LosAlpesNewdf['room type'])
In [116]:
LosAlpesNewdf.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 513 entries, 0 to 512
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           513 non-null    int64  
 1   neighbourhood group          513 non-null    object 
 2   neighbourhood                512 non-null    object 
 3   lat                          512 non-null    float64
 4   long                         512 non-null    float64
 5   country                      513 non-null    object 
 6   instant_bookable             513 non-null    bool   
 7   cancellation_policy          513 non-null    object 
 8   room type                    513 non-null    object 
 9   construction year            513 non-null    float64
 10  price                        513 non-null    float64
 11  service fee                  513 non-null    float64
 12  minimum nights               513 non-null    float64
 13  availability 365             513 non-null    float64
 14  neighbourhood group encoded  513 non-null    int32  
 15  neighbourhood encoded        513 non-null    int32  
 16  cancellation_policy encoded  513 non-null    int32  
 17  room type encoded            513 non-null    int32  
dtypes: bool(1), float64(7), int32(4), int64(1), object(5)
memory usage: 64.6+ KB
In [117]:
PredictionDF = LosAlpesNewdf[['instant_bookable', 'room type encoded','construction year','price','service fee','minimum nights','availability 365','neighbourhood group encoded','cancellation_policy encoded','neighbourhood encoded']]
In [118]:
PredictionDF.drop_duplicates(keep='first', inplace=True)
In [119]:
PredictionDF.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 513 entries, 0 to 512
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   instant_bookable             513 non-null    bool   
 1   room type encoded            513 non-null    int32  
 2   construction year            513 non-null    float64
 3   price                        513 non-null    float64
 4   service fee                  513 non-null    float64
 5   minimum nights               513 non-null    float64
 6   availability 365             513 non-null    float64
 7   neighbourhood group encoded  513 non-null    int32  
 8   cancellation_policy encoded  513 non-null    int32  
 9   neighbourhood encoded        513 non-null    int32  
dtypes: bool(1), float64(5), int32(4)
memory usage: 32.6 KB
In [120]:
# En este punto utilizaré la predicción Ridge, de acuerdo a lo expuesto en el punto anterior. Podría utilizar laso recortando la cantidad de variables que vamos a usar. 

x_ridge_predict = poly_features.fit_transform(PredictionDF)
df_ridge_predict=ridge_df.predict(x_ridge_predict)
In [121]:
PredictionDF['Predicted Reviews']=df_ridge_predict
PredictionDF['Predicted Reviews']=PredictionDF['Predicted Reviews'].astype(int, errors = 'raise')
In [122]:
PredictionDF.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 513 entries, 0 to 512
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   instant_bookable             513 non-null    bool   
 1   room type encoded            513 non-null    int32  
 2   construction year            513 non-null    float64
 3   price                        513 non-null    float64
 4   service fee                  513 non-null    float64
 5   minimum nights               513 non-null    float64
 6   availability 365             513 non-null    float64
 7   neighbourhood group encoded  513 non-null    int32  
 8   cancellation_policy encoded  513 non-null    int32  
 9   neighbourhood encoded        513 non-null    int32  
 10  Predicted Reviews            513 non-null    int32  
dtypes: bool(1), float64(5), int32(5)
memory usage: 34.6 KB
In [123]:
# Depuro un poco los datos predichos, quitandole ceros.

PredictionDF['Predicted Reviews']=PredictionDF['Predicted Reviews'].apply(corregirNegativo)
PredictionDF.sort_values('Predicted Reviews', ascending=True)
Out[123]:
instant_bookable room type encoded construction year price service fee minimum nights availability 365 neighbourhood group encoded cancellation_policy encoded neighbourhood encoded Predicted Reviews
431 True 3 2022.0 892.00 178.0 91.0 34.0 2 2 61 0
339 False 3 2010.0 547.00 109.0 1.0 0.0 5 2 31 0
33 True 3 2017.0 1.17 234.0 3.0 0.0 1 3 21 3
43 True 3 2009.0 613.00 123.0 30.0 0.0 4 3 20 5
426 True 4 2009.0 882.00 176.0 1.0 0.0 4 3 1 7
... ... ... ... ... ... ... ... ... ... ... ...
165 True 0 2010.0 166.00 33.0 2.0 301.0 0 2 65 57
14 True 0 2017.0 888.00 178.0 3.0 166.0 1 3 5 60
346 True 1 2003.0 1.11 221.0 1.0 346.0 2 3 89 104
176 False 1 2019.0 490.00 98.0 1.0 282.0 2 2 89 110
323 False 2 2002.0 0.00 0.0 0.0 0.0 3 0 94 114

513 rows × 11 columns

In [124]:
# Incluyo una nueva columna que será el costo de publicitar el inmueble, que es el 2% del valor.  Tambien saco la media de los valores predichos y todo inmueble que no supere este rango, va a denominarse
# inmueble poco popular.

PredictionDF['Costo Publicacion']=PredictionDF['price']*0.02
vMean = PredictionDF['Predicted Reviews'].mean()
In [125]:
vMean
Out[125]:
29.621832358674464
In [126]:
PredictionDF
Out[126]:
instant_bookable room type encoded construction year price service fee minimum nights availability 365 neighbourhood group encoded cancellation_policy encoded neighbourhood encoded Predicted Reviews Costo Publicacion
0 False 3 2006.0 519.00 104.0 1.0 0.0 4 2 63 20 10.3800
1 False 0 2006.0 987.00 197.0 2.0 42.0 2 2 15 33 19.7400
2 True 3 2018.0 999.00 200.0 2.0 0.0 4 1 24 19 19.9800
3 True 3 2011.0 497.00 99.0 4.0 0.0 2 3 38 19 9.9400
4 True 0 2022.0 820.00 164.0 4.0 365.0 2 3 85 24 16.4000
... ... ... ... ... ... ... ... ... ... ... ... ...
508 False 3 2014.0 103.00 21.0 2.0 0.0 1 2 3 10 2.0600
509 True 3 2008.0 110.00 22.0 2.0 21.0 1 2 3 20 2.2000
510 False 0 2022.0 251.00 50.0 30.0 335.0 1 3 90 27 5.0200
511 True 3 2022.0 1.01 201.0 1.0 41.0 1 3 35 28 0.0202
512 True 3 2016.0 1.05 209.0 1.0 162.0 4 3 71 52 0.0210

513 rows × 12 columns

In [127]:
# Al final creo un nuevo dataframe con los inmuebles que poco populares (numero de reviews menores a la media)

flt = PredictionDF['Predicted Reviews']<vMean
InmuebleaPublicarDF = PredictionDF[flt]
In [128]:
InmuebleaPublicarDF
Out[128]:
instant_bookable room type encoded construction year price service fee minimum nights availability 365 neighbourhood group encoded cancellation_policy encoded neighbourhood encoded Predicted Reviews Costo Publicacion
0 False 3 2006.0 519.00 104.0 1.0 0.0 4 2 63 20 10.3800
2 True 3 2018.0 999.00 200.0 2.0 0.0 4 1 24 19 19.9800
3 True 3 2011.0 497.00 99.0 4.0 0.0 2 3 38 19 9.9400
4 True 0 2022.0 820.00 164.0 4.0 365.0 2 3 85 24 16.4000
6 True 0 2014.0 913.00 183.0 7.0 0.0 1 2 90 15 18.2600
... ... ... ... ... ... ... ... ... ... ... ... ...
507 False 4 2009.0 630.00 126.0 27.0 365.0 2 2 13 10 12.6000
508 False 3 2014.0 103.00 21.0 2.0 0.0 1 2 3 10 2.0600
509 True 3 2008.0 110.00 22.0 2.0 21.0 1 2 3 20 2.2000
510 False 0 2022.0 251.00 50.0 30.0 335.0 1 3 90 27 5.0200
511 True 3 2022.0 1.01 201.0 1.0 41.0 1 3 35 28 0.0202

266 rows × 12 columns

In [129]:
# para encontrar el presupuesto del area, sumo el costo y cuento la cantidad de lineas del nuevo dataframe.
Presupuesto=InmuebleaPublicarDF['Costo Publicacion'].sum()
In [130]:
Presupuesto
Out[130]:
2249.8884
In [131]:
InmuebleaPublicarDF.shape
Out[131]:
(266, 12)
In [132]:
# La respuesta es: Presupuesto: 2250 dolares para publicitar 266 inmuebles menos populares.